sglang 0.5.1.post2__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/bench_one_batch_server.py +79 -53
  2. sglang/bench_serving.py +186 -14
  3. sglang/profiler.py +0 -1
  4. sglang/srt/conversation.py +38 -5
  5. sglang/srt/entrypoints/engine.py +1 -1
  6. sglang/srt/entrypoints/openai/protocol.py +27 -24
  7. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  8. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  9. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  10. sglang/srt/function_call/function_call_parser.py +2 -0
  11. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  12. sglang/srt/harmony_parser.py +588 -0
  13. sglang/srt/hf_transformers_utils.py +16 -7
  14. sglang/srt/layers/attention/ascend_backend.py +218 -111
  15. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  16. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  18. sglang/srt/layers/communicator.py +1 -2
  19. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  20. sglang/srt/layers/moe/ep_moe/layer.py +1 -7
  21. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  22. sglang/srt/layers/moe/topk.py +1 -1
  23. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  24. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
  25. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  26. sglang/srt/layers/quantization/fp8.py +2 -1
  27. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  28. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  29. sglang/srt/layers/quantization/mxfp4.py +16 -23
  30. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  31. sglang/srt/layers/utils.py +0 -14
  32. sglang/srt/managers/cache_controller.py +223 -156
  33. sglang/srt/managers/detokenizer_manager.py +5 -0
  34. sglang/srt/managers/io_struct.py +30 -0
  35. sglang/srt/managers/scheduler.py +58 -7
  36. sglang/srt/managers/tokenizer_manager.py +36 -3
  37. sglang/srt/mem_cache/hicache_storage.py +31 -20
  38. sglang/srt/mem_cache/hiradix_cache.py +12 -3
  39. sglang/srt/mem_cache/memory_pool.py +73 -14
  40. sglang/srt/mem_cache/memory_pool_host.py +3 -2
  41. sglang/srt/mem_cache/radix_cache.py +1 -0
  42. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
  43. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
  44. sglang/srt/model_executor/model_runner.py +1 -1
  45. sglang/srt/models/deepseek_v2.py +12 -3
  46. sglang/srt/models/gpt_oss.py +2 -1
  47. sglang/srt/models/qwen2_5_vl.py +1 -0
  48. sglang/srt/reasoning_parser.py +56 -300
  49. sglang/srt/server_args.py +10 -1
  50. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  51. sglang/srt/utils.py +59 -5
  52. sglang/version.py +1 -1
  53. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +4 -3
  54. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +57 -54
  55. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
  56. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
  57. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch_server.py CHANGED
@@ -18,7 +18,7 @@ import json
 import multiprocessing
 import os
 import time
-from typing import Tuple
+from typing import List, Tuple

 import requests

@@ -45,6 +45,7 @@ class BenchArgs:
     skip_warmup: bool = False
     show_report: bool = False
     profile: bool = False
+    profile_steps: int = 3
     profile_by_stage: bool = False

     @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
         parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
         parser.add_argument("--profile-by-stage", action="store_true")

     @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
     result_filename: str,
     tokenizer,
     profile: bool = False,
+    profile_steps: int = 3,
     profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@ def run_one_case(
     profile_link = None
     if profile:
         profile_link: str = run_profile(
-            url, 3, ["CPU", "GPU"], None, None, profile_by_stage
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
         )

     tic = time.perf_counter()
@@ -247,6 +252,71 @@ def run_one_case(
     )


+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if bench_args.base_url:
         proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 result_filename=bench_args.result_filename,
                 tokenizer=tokenizer,
                 profile=bench_args.profile,
+                profile_steps=bench_args.profile_steps,
                 profile_by_stage=bench_args.profile_by_stage,
             )[-1],
         )
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return

-    summary = (
-        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
-    )
-    summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
-    if bench_args.profile:
-        summary += " profile |"
-
-    summary += "\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
-    if bench_args.profile:
-        summary += "-------------|"
-    summary += "\n"
-
-    for (
-        batch_size,
-        latency,
-        ttft,
-        input_throughput,
-        output_throughput,
-        overall_throughput,
-        last_gen_throughput,
-        acc_length,
-        trace_link,
-    ) in result:
-        if is_blackwell():
-            hourly_cost_per_gpu = 4  # $4/hour for one B200
-        else:
-            hourly_cost_per_gpu = 2  # $2/hour for one H100
-
-        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
-        input_util = 0.7
-        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
-        line = (
-            f"| {batch_size} | "
-            f"{latency:.2f} | "
-            f"{input_throughput:.2f} | "
-            f"{output_throughput:.2f} | "
-            f"{accept_length} | "
-            f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
-            f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
-        )
-        if trace_link:
-            line += f" [Profile]({trace_link}) |"
-        line += "\n"
-        summary += line
-
-    # print metrics table
+    summary = get_report_summary(result, server_args, bench_args)
     print(summary)

     if is_in_ci():
         write_github_step_summary(summary)


-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
     bench_args = BenchArgs.from_cli_args(args)

     run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
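
The new get_report_summary helper derives the ITL and cost columns directly from the measured throughputs. A standalone sketch of the same arithmetic, with made-up numbers (the $2/hour-per-H100 rate and the 0.7 input-utilization factor mirror the constants in the hunk above):

# Sketch of the ITL / cost math used by get_report_summary (hypothetical inputs).
batch_size = 8
input_throughput = 6500.0   # tok/s
output_throughput = 1800.0  # tok/s
tp_size = 1

hourly_cost = 2 * tp_size   # $2/hour per H100, scaled by the TP group size
input_util = 0.7            # assumed input-side utilization

itl_ms = 1 / (output_throughput / batch_size) * 1000
input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
output_cost = 1e6 / output_throughput / 3600 * hourly_cost
print(f"ITL {itl_ms:.2f} ms | input ${input_cost:.2f}/1M tok | output ${output_cost:.2f}/1M tok")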
sglang/bench_serving.py CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro

 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]


@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
             },
         ]
     else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }

-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
     if request_func_input.image_data:
         payload["image_data"] = request_func_input.image_data

@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
            prompt_suffix=args.prompt_suffix,
            apply_chat_template=args.apply_chat_template,
        )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
        input_requests = sample_random_requests(
            input_len=args.random_input_len,
            output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
            random_sample=args.dataset_name == "random",
            return_text=not tokenize_prompt,
        )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None


 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                 prompt=prompt,
                 prompt_len=prompt_len,
                 output_len=output_len,
-                image_data=image_data,
+                image_data=[image_data],
             )
         )

@@ -1113,6 +1130,132 @@ def sample_random_requests(
     return input_requests


+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+        ],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
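
The random-image dataset attaches a list of base64 JPEG data URIs to every request and, for the OpenAI chat backend, sends them as image_url entries ahead of the text (selected with --dataset-name random-image plus --random-image-num-images / --random-image-resolution). A minimal sketch of the same construction, independent of bench_serving internals (assumes numpy and Pillow are installed; standard base64 is used here instead of pybase64):

import base64
import io

import numpy as np
from PIL import Image

def random_jpeg_data_uri(width: int = 1280, height: int = 720) -> str:
    # Random RGB noise encoded as a JPEG data URI, mirroring _gen_random_image_data_uri.
    arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
    buf = io.BytesIO()
    Image.fromarray(arr, mode="RGB").save(buf, format="JPEG", quality=85)
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

# One OpenAI-style chat message carrying two images followed by the text prompt.
images = [random_jpeg_data_uri() for _ in range(2)]
content = [{"type": "image_url", "image_url": {"url": u}} for u in images]
content.append({"type": "text", "text": "Describe both images."})
messages = [{"role": "user", "content": content}]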
sglang/profiler.py CHANGED
@@ -9,7 +9,6 @@ import argparse
 import json
 import os
 import time
-import urllib.parse
 from argparse import ArgumentParser
 from pathlib import Path
 from typing import List, Optional
sglang/srt/conversation.py CHANGED
@@ -26,6 +26,8 @@ Key components:
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
+import json
+import os
 import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -959,16 +961,42 @@ register_conv_template(
 )


+MODEL_TYPE_TO_TEMPLATE = {
+    "internvl_chat": "internvl-2-5",
+    "deepseek_vl_v2": "deepseek-vl2",
+    "multi_modality": "janus-pro",
+    "phi4mm": "phi-4-mm",
+    "minicpmv": "minicpmv",
+    "minicpmo": "minicpmo",
+}
+
+
+def get_model_type(model_path: str) -> Optional[str]:
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("model_type")
+    except (IOError, json.JSONDecodeError):
+        return None
+
+
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)


 @register_conv_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
     if re.search(r"janus", model_path, re.IGNORECASE):
         return "janus-pro"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)


 @register_conv_template_matching_function
@@ -981,6 +1009,8 @@ def match_vicuna(model_path: str):
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
         return "deepseek-vl2"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)


 @register_conv_template_matching_function
@@ -994,14 +1024,17 @@ def match_qwen_chat_ml(model_path: str):


 @register_conv_template_matching_function
-def match_openbmb_minicpm(model_path: str):
-    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
-        return "minicpmv"
-    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
-        return "minicpmo"
+def match_minicpm(model_path: str):
+    match = re.search(r"minicpm-(v|o)", model_path, re.IGNORECASE)
+    if match:
+        return f"minicpm{match.group(1).lower()}"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)


 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
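
The matcher functions now fall back from path-name regexes to the model_type recorded in the checkpoint's config.json, via get_model_type and MODEL_TYPE_TO_TEMPLATE. A small sketch of that lookup against a local checkpoint directory (the path below is hypothetical; the mapping is the one defined in the hunk above):

import json
import os
from typing import Optional

MODEL_TYPE_TO_TEMPLATE = {
    "internvl_chat": "internvl-2-5",
    "deepseek_vl_v2": "deepseek-vl2",
    "multi_modality": "janus-pro",
    "phi4mm": "phi-4-mm",
    "minicpmv": "minicpmv",
    "minicpmo": "minicpmo",
}

def template_from_config(model_path: str) -> Optional[str]:
    # Mirror get_model_type: read config.json and map its model_type to a chat template.
    config_path = os.path.join(model_path, "config.json")
    if not os.path.exists(config_path):
        return None
    with open(config_path, "r", encoding="utf-8") as f:
        model_type = json.load(f).get("model_type")
    return MODEL_TYPE_TO_TEMPLATE.get(model_type)

# e.g. a locally converted InternVL checkpoint whose directory name lacks "internvl"
print(template_from_config("/models/my-vlm-checkpoint"))  # -> "internvl-2-5" if model_type is internvl_chat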
sglang/srt/entrypoints/engine.py CHANGED
@@ -680,7 +680,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.5",
+            "0.3.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

sglang/srt/entrypoints/openai/protocol.py CHANGED
@@ -35,6 +35,8 @@ from pydantic import (
 )
 from typing_extensions import Literal

+DEFAULT_MODEL_NAME = "default"
+

 class ModelCard(BaseModel):
     """Model cards."""
@@ -108,6 +110,23 @@ class JsonSchemaResponseFormat(BaseModel):
     strict: Optional[bool] = False


+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
 class FileRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/files/create
     file: bytes  # The File object (not file name) to be uploaded
@@ -166,7 +185,7 @@ class BatchResponse(BaseModel):
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
     echo: bool = False
@@ -200,6 +219,7 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None

     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
@@ -327,7 +347,7 @@ class ToolCall(BaseModel):


 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant", "tool"]
+    role: Literal["system", "assistant", "tool", "function"]
     content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
         default=None
     )
@@ -341,9 +361,9 @@ class ChatCompletionMessageGenericParam(BaseModel):
     def _normalize_role(cls, v):
         if isinstance(v, str):
             v_lower = v.lower()
-            if v_lower not in {"system", "assistant", "tool"}:
+            if v_lower not in {"system", "assistant", "tool", "function"}:
                 raise ValueError(
-                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
+                    "'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
                 )
             return v_lower
         raise ValueError("'role' must be a string")
@@ -359,23 +379,6 @@ ChatCompletionMessageParam = Union[
 ]


-class ResponseFormat(BaseModel):
-    type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
-
-
-class StructuresResponseFormat(BaseModel):
-    begin: str
-    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
-    end: str
-
-
-class StructuralTagResponseFormat(BaseModel):
-    type: Literal["structural_tag"]
-    structures: List[StructuresResponseFormat]
-    triggers: List[str]
-
-
 class Function(BaseModel):
     """Function descriptions."""

@@ -409,7 +412,7 @@ class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: bool = False
@@ -571,7 +574,7 @@ class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
     input: EmbeddingInput
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     encoding_format: str = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None
@@ -605,7 +608,7 @@ class ScoringRequest(BaseModel):
     )
     apply_softmax: bool = False
     item_first: bool = False
-    model: str
+    model: str = DEFAULT_MODEL_NAME


 class ScoringResponse(BaseModel):
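
With DEFAULT_MODEL_NAME as the fallback and the response-format models now defined ahead of CompletionRequest, a completion request can omit model and carry a structural_tag response_format. A hedged sketch of building such a request with these pydantic models (field values are illustrative only):

from sglang.srt.entrypoints.openai.protocol import (
    CompletionRequest,
    StructuralTagResponseFormat,
    StructuresResponseFormat,
)

# The "schema" keyword feeds StructuresResponseFormat.schema_ through its alias.
request = CompletionRequest(
    prompt="Emit the user record: ",
    response_format=StructuralTagResponseFormat(
        type="structural_tag",
        structures=[
            StructuresResponseFormat(
                begin="<user>",
                schema={"type": "object", "properties": {"name": {"type": "string"}}},
                end="</user>",
            )
        ],
        triggers=["<user>"],
    ),
)
print(request.model)  # "default", supplied by DEFAULT_MODEL_NAME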