sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.distributed.parallel_state import destroy_distributed_environment
  from sglang.srt.entrypoints.engine import _set_envs_and_config
  from sglang.srt.hf_transformers_utils import get_tokenizer
+ from sglang.srt.layers.moe import initialize_moe_config
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.managers.scheduler import Scheduler
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -509,6 +510,8 @@ def latency_test(
  bench_args,
  tp_rank,
  ):
+ initialize_moe_config(server_args)
+
  # Set CPU affinity
  if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
  set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
sglang/bench_one_batch_server.py CHANGED
@@ -18,7 +18,7 @@ import json
  import multiprocessing
  import os
  import time
- from typing import Tuple
+ from typing import List, Tuple

  import requests

@@ -45,6 +45,7 @@ class BenchArgs:
  skip_warmup: bool = False
  show_report: bool = False
  profile: bool = False
+ profile_steps: int = 3
  profile_by_stage: bool = False

  @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
  parser.add_argument("--skip-warmup", action="store_true")
  parser.add_argument("--show-report", action="store_true")
  parser.add_argument("--profile", action="store_true")
+ parser.add_argument(
+ "--profile-steps", type=int, default=BenchArgs.profile_steps
+ )
  parser.add_argument("--profile-by-stage", action="store_true")

  @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
  result_filename: str,
  tokenizer,
  profile: bool = False,
+ profile_steps: int = 3,
  profile_by_stage: bool = False,
  ):
  requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@
  profile_link = None
  if profile:
  profile_link: str = run_profile(
- url, 3, ["CPU", "GPU"], None, None, profile_by_stage
+ url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
  )

  tic = time.perf_counter()
@@ -247,6 +252,71 @@
  )


+ def get_report_summary(
+ result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+ ):
+ import tabulate
+
+ summary = (
+ f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+ )
+
+ headers = [
+ "batch size",
+ "latency (s)",
+ "input throughput (tok/s)",
+ "output throughput (tok/s)",
+ "acc length",
+ "ITL (ms)",
+ "input cost ($/1M)",
+ "output cost ($/1M)",
+ ]
+ if bench_args.profile:
+ headers.append("profile")
+ rows = []
+
+ for (
+ batch_size,
+ latency,
+ ttft,
+ input_throughput,
+ output_throughput,
+ _,
+ _,
+ acc_length,
+ trace_link,
+ ) in result:
+ if is_blackwell():
+ hourly_cost_per_gpu = 4 # $4/hour for one B200
+ else:
+ hourly_cost_per_gpu = 2 # $2/hour for one H100
+
+ hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+ input_util = 0.7
+ accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+ itl = 1 / (output_throughput / batch_size) * 1000
+ input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+ output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+ row = [
+ batch_size,
+ latency,
+ input_throughput,
+ output_throughput,
+ accept_length,
+ itl,
+ input_cost,
+ output_cost,
+ ]
+ if trace_link:
+ row.append(f"[Profile]({trace_link})")
+ rows.append(row)
+
+ summary += tabulate.tabulate(
+ rows, headers=headers, tablefmt="github", floatfmt=".2f"
+ )
+ return summary
+
+
  def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
  if bench_args.base_url:
  proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
  result_filename=bench_args.result_filename,
  tokenizer=tokenizer,
  profile=bench_args.profile,
+ profile_steps=bench_args.profile_steps,
  profile_by_stage=bench_args.profile_by_stage,
  )[-1],
  )
@@ -337,63 +408,14 @@
  if not bench_args.show_report:
  return

- summary = (
- f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
- )
- summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
- if bench_args.profile:
- summary += " profile |"
-
- summary += "\n"
- summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
- if bench_args.profile:
- summary += "-------------|"
- summary += "\n"
-
- for (
- batch_size,
- latency,
- ttft,
- input_throughput,
- output_throughput,
- overall_throughput,
- last_gen_throughput,
- acc_length,
- trace_link,
- ) in result:
- if is_blackwell():
- hourly_cost_per_gpu = 4 # $4/hour for one B200
- else:
- hourly_cost_per_gpu = 2 # $2/hour for one H100
-
- hourly_cost = hourly_cost_per_gpu * server_args.tp_size
- input_util = 0.7
- accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
- line = (
- f"| {batch_size} | "
- f"{latency:.2f} | "
- f"{input_throughput:.2f} | "
- f"{output_throughput:.2f} | "
- f"{accept_length} | "
- f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
- f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
- f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
- )
- if trace_link:
- line += f" [Profile]({trace_link}) |"
- line += "\n"
- summary += line
-
- # print metrics table
+ summary = get_report_summary(result, server_args, bench_args)
  print(summary)

  if is_in_ci():
  write_github_step_summary(summary)


- if __name__ == "__main__":
+ def main():
  parser = argparse.ArgumentParser()
  ServerArgs.add_cli_args(parser)
  BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
  bench_args = BenchArgs.from_cli_args(args)

  run_benchmark(server_args, bench_args)
+
+
+ if __name__ == "__main__":
+ main()
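Note: with --show-report the summary table is now produced by get_report_summary() via tabulate (GitHub table format) rather than hand-assembled Markdown strings, so tabulate must be importable in the benchmark environment. A hedged invocation sketch showing the new flag (the module entry point and model placeholder are assumptions, not part of this diff; only --show-report, --profile, --profile-steps, and --profile-by-stage appear above):

    python3 -m sglang.bench_one_batch_server --model-path <model> \
        --show-report --profile --profile-steps 5 --profile-by-stage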
sglang/bench_serving.py CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro

  import argparse
  import asyncio
+ import base64
+ import io
  import json
  import os
  import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
  output_len: int
  model: str
  lora_name: str
- image_data: str
+ image_data: Optional[List[str]]
  extra_request_body: Dict[str, Any]

@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
  ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

  if request_func_input.image_data:
+ # Build multi-image content: a list of image_url entries followed by the text
+ content_items = [
+ {
+ "type": "image_url",
+ "image_url": {"url": img_url},
+ }
+ for img_url in request_func_input.image_data
+ ]
+ content_items.append({"type": "text", "text": request_func_input.prompt})
  messages = [
  {
  "role": "user",
- "content": [
- {
- "type": "image_url",
- "image_url": {"url": request_func_input.image_data},
- },
- {"type": "text", "text": request_func_input.prompt},
- ],
+ "content": content_items,
  },
  ]
  else:
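For reference, a sketch of the message shape this now builds when image_data holds two images (the data URIs and prompt text below are illustrative placeholders, not from the diff):

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
                {"type": "text", "text": "<prompt>"},
            ],
        }
    ]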
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
  **request_func_input.extra_request_body,
  }

- # Add image data if available
+ # Add image data if available (list of image urls/base64)
  if request_func_input.image_data:
  payload["image_data"] = request_func_input.image_data

@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
  prompt_suffix=args.prompt_suffix,
  apply_chat_template=args.apply_chat_template,
  )
- elif args.dataset_name.startswith("random"):
+ elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
  input_requests = sample_random_requests(
  input_len=args.random_input_len,
  output_len=args.random_output_len,
@@ -659,6 +664,18 @@
  random_sample=args.dataset_name == "random",
  return_text=not tokenize_prompt,
  )
+ elif args.dataset_name == "random-image":
+ assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+ input_requests = sample_random_image_requests(
+ num_requests=args.num_prompts,
+ num_images=args.random_image_num_images,
+ input_len=args.random_input_len,
+ output_len=args.random_output_len,
+ range_ratio=args.random_range_ratio,
+ tokenizer=tokenizer,
+ apply_chat_template=args.apply_chat_template,
+ image_resolution=args.random_image_resolution,
+ )
  elif args.dataset_name == "generated-shared-prefix":
  assert not tokenize_prompt
  input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
  prompt: str
  prompt_len: int
  output_len: int
- image_data: Optional[str] = None
+ image_data: Optional[List[str]] = None


  def sample_mmmu_requests(
@@ -913,7 +930,7 @@
  prompt=prompt,
  prompt_len=prompt_len,
  output_len=output_len,
- image_data=image_data,
+ image_data=[image_data],
  )
  )

@@ -1113,6 +1130,132 @@
  return input_requests


+ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+ """Parse image resolution into (width, height).
+
+ Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+ (e.g., '1080x1920' means height=1080, width=1920).
+ """
+ resolution_to_size = {
+ "4k": (3840, 2160),
+ "1080p": (1920, 1080),
+ "720p": (1280, 720),
+ "360p": (640, 360),
+ }
+ if image_resolution in resolution_to_size:
+ return resolution_to_size[image_resolution]
+
+ res = image_resolution.strip().lower()
+ if "x" in res:
+ parts = res.split("x")
+ if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+ height = int(parts[0])
+ width = int(parts[1])
+ if height > 0 and width > 0:
+ return (width, height)
+
+ raise ValueError(
+ f"Unsupported random-image resolution: {image_resolution}. "
+ "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+ )
+
+
+ def sample_random_image_requests(
+ num_requests: int,
+ num_images: int,
+ input_len: int,
+ output_len: int,
+ range_ratio: float,
+ tokenizer: PreTrainedTokenizerBase,
+ apply_chat_template: bool = True,
+ image_resolution: str = "1080p",
+ ) -> List[DatasetRow]:
+ """Generate requests with random images.
+
+ - Each request includes ``num_images`` random images.
+ - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+ or custom 'heightxwidth' (e.g., 1080x1920).
+ - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+ only counts text tokens and excludes image data.
+ """
+ try:
+ import pybase64
+ from PIL import Image
+ except ImportError as e:
+ raise ImportError(
+ "Please install Pillow to generate random images: pip install pillow"
+ ) from e
+
+ # Parse resolution (supports presets and 'heightxwidth')
+ width, height = parse_random_image_resolution(image_resolution)
+
+ # Check for potentially problematic combinations and warn user
+ if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+ warnings.warn(
+ f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+ f"may take a long time. Consider reducing resolution or image count.",
+ UserWarning,
+ stacklevel=2,
+ )
+
+ # Sample text lengths
+ input_lens = np.random.randint(
+ max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+ )
+ output_lens = np.random.randint(
+ int(output_len * range_ratio), output_len + 1, size=num_requests
+ )
+
+ def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+ arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+ img = Image.fromarray(arr, mode="RGB")
+ buf = io.BytesIO()
+ img.save(buf, format="JPEG", quality=85)
+ encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+ return f"data:image/jpeg;base64,{encoded}"
+
+ dataset: List[DatasetRow] = []
+ for i in range(num_requests):
+ # Generate text prompt
+ text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+ # Generate image list
+ images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+ prompt_str = text_prompt
+ if apply_chat_template:
+ try:
+ content_items = [
+ {"type": "image_url", "image_url": {"url": img_url}}
+ for img_url in images
+ ]
+ content_items.append({"type": "text", "text": text_prompt})
+ prompt_str = tokenizer.apply_chat_template(
+ [{"role": "user", "content": content_items}],
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ except Exception:
+ # Some tokenizers do not support list content; fall back to a placeholder in the text
+ prompt_str = f"<image>{text_prompt}"
+
+ prompt_token_ids = tokenizer.encode(prompt_str)
+ prompt_token_len = len(prompt_token_ids)
+
+ dataset.append(
+ DatasetRow(
+ prompt=prompt_str,
+ prompt_len=prompt_token_len,
+ output_len=int(output_lens[i]),
+ image_data=images,
+ )
+ )
+
+ print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+ print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+ return dataset
+
+
  def gen_prompt(tokenizer, token_num):
  """Generate a random prompt of specified token length using tokenizer vocabulary."""
  all_available_tokens = list(tokenizer.get_vocab().values())
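A quick check of the new parsing helper above: presets return (width, height), and the custom form is written height-first but still returned as (width, height).

    parse_random_image_resolution("720p")       # (1280, 720)
    parse_random_image_resolution("1080x1920")  # (1920, 1080): input is height x width
    parse_random_image_resolution("999q")       # raises ValueError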
@@ -1579,7 +1722,13 @@
  output_file_name = args.output_file
  else:
  now = datetime.now().strftime("%m%d")
- if args.dataset_name.startswith("random"):
+ if args.dataset_name == "random-image":
+ output_file_name = (
+ f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+ f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+ f"{args.random_image_resolution}.jsonl"
+ )
+ elif args.dataset_name.startswith("random"):
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
  else:
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@
  "--dataset-name",
  type=str,
  default="sharegpt",
- choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+ choices=[
+ "sharegpt",
+ "random",
+ "random-ids",
+ "generated-shared-prefix",
+ "mmmu",
+ "random-image",
+ ],
  help="Name of the dataset to benchmark on.",
  )
  parser.add_argument(
@@ -1872,6 +2028,22 @@
  help="Range of sampled ratio of input/output length, "
  "used only for random dataset.",
  )
+ # random-image dataset args
+ parser.add_argument(
+ "--random-image-num-images",
+ type=int,
+ default=1,
+ help="Number of images per request (only available with the random-image dataset)",
+ )
+ parser.add_argument(
+ "--random-image-resolution",
+ type=str,
+ default="1080p",
+ help=(
+ "Resolution of random images for random-image dataset. "
+ "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+ ),
+ )
  parser.add_argument(
  "--request-rate",
  type=float,
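Putting the new dataset together, a hedged example run (all values illustrative; the backend/model flags follow the existing usage string at the top of bench_serving.py):

    python3 -m sglang.bench_serving --backend sglang --dataset-name random-image \
        --num-prompts 64 --random-input-len 512 --random-output-len 128 \
        --random-image-num-images 2 --random-image-resolution 720p

Each request then carries two 1280x720 random JPEGs as base64 data URIs; per the docstring above, prompt_len counts only the text tokens.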
sglang/profiler.py CHANGED
@@ -9,7 +9,6 @@ import argparse
  import json
  import os
  import time
- import urllib.parse
  from argparse import ArgumentParser
  from pathlib import Path
  from typing import List, Optional
sglang/srt/configs/__init__.py CHANGED
@@ -5,6 +5,7 @@ from sglang.srt.configs.exaone import ExaoneConfig
  from sglang.srt.configs.janus_pro import MultiModalityConfig
  from sglang.srt.configs.kimi_vl import KimiVLConfig
  from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+ from sglang.srt.configs.longcat_flash import LongcatFlashConfig
  from sglang.srt.configs.step3_vl import (
  Step3TextConfig,
  Step3VisionEncoderConfig,
@@ -16,6 +17,7 @@ __all__ = [
  "ChatGLMConfig",
  "DbrxConfig",
  "DeepseekVL2Config",
+ "LongcatFlashConfig",
  "MultiModalityConfig",
  "KimiVLConfig",
  "MoonViTConfig",
sglang/srt/configs/longcat_flash.py ADDED
@@ -0,0 +1,104 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ FLASH_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class LongcatFlashConfig(PretrainedConfig):
+ model_type = "longcat_flash"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=131072,
+ hidden_size=6144,
+ intermediate_size=None,
+ ffn_hidden_size=12288,
+ expert_ffn_hidden_size=2048,
+ num_layers=28,
+ num_hidden_layers=None,
+ num_attention_heads=64,
+ ep_size=1,
+ kv_lora_rank=512,
+ q_lora_rank=1536,
+ qk_rope_head_dim=128,
+ qk_nope_head_dim=128,
+ v_head_dim=128,
+ n_routed_experts=512,
+ moe_topk=12,
+ norm_topk_prob=False,
+ max_position_embeddings=131072,
+ rms_norm_eps=1e-05,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000000.0,
+ rope_scaling=None,
+ attention_bias=False,
+ attention_dropout=0.0,
+ mla_scale_q_lora=True,
+ mla_scale_kv_lora=True,
+ torch_dtype="bfloat16",
+ params_dtype="bfloat16",
+ rounter_params_dtype="float32",
+ router_bias=False,
+ topk_method=None,
+ routed_scaling_factor=6.0,
+ zero_expert_num=256,
+ zero_expert_type="identity",
+ nextn_use_scmoe=False,
+ num_nextn_predict_layers=1,
+ **kwargs,
+ ):
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ torch_dtype=torch_dtype,
+ params_dtype=params_dtype,
+ rounter_params_dtype=rounter_params_dtype,
+ topk_method=topk_method,
+ router_bias=router_bias,
+ nextn_use_scmoe=nextn_use_scmoe,
+ num_nextn_predict_layers=num_nextn_predict_layers,
+ **kwargs,
+ )
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = (
+ num_hidden_layers if num_hidden_layers is not None else num_layers
+ )
+ self.intermediate_size = (
+ intermediate_size if intermediate_size is not None else ffn_hidden_size
+ )
+ self.moe_intermediate_size = expert_ffn_hidden_size
+ self.num_attention_heads = num_attention_heads
+ self.ep_size = ep_size
+ self.kv_lora_rank = kv_lora_rank
+ self.q_lora_rank = q_lora_rank
+ self.qk_rope_head_dim = qk_rope_head_dim
+ self.v_head_dim = v_head_dim
+ self.qk_nope_head_dim = qk_nope_head_dim
+ self.n_routed_experts = n_routed_experts
+ self.moe_topk = moe_topk
+ self.norm_topk_prob = norm_topk_prob
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self.attention_bias = attention_bias
+ self.attention_dropout = attention_dropout
+ self.mla_scale_q_lora = mla_scale_q_lora
+ self.mla_scale_kv_lora = mla_scale_kv_lora
+ self.zero_expert_num = zero_expert_num
+ self.zero_expert_type = zero_expert_type
+ self.routed_scaling_factor = routed_scaling_factor
+ self.hidden_act = "silu"
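A small hedged sketch (not part of the diff) of how the fallback defaults in this config resolve when no checkpoint values are supplied:

    from sglang.srt.configs import LongcatFlashConfig

    cfg = LongcatFlashConfig()
    print(cfg.num_hidden_layers)      # 28, falls back to num_layers
    print(cfg.intermediate_size)      # 12288, falls back to ffn_hidden_size
    print(cfg.moe_intermediate_size)  # 2048, copied from expert_ffn_hidden_size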
sglang/srt/configs/model_config.py CHANGED
@@ -132,6 +132,13 @@ class ModelConfig:
  if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
  self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"

+ if (
+ is_draft_model
+ and self.hf_config.architectures[0] == "LongcatFlashForCausalLM"
+ ):
+ self.hf_config.architectures[0] = "LongcatFlashForCausalLMNextN"
+ self.hf_config.num_hidden_layers = self.hf_config.num_nextn_predict_layers
+
  if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
  self.hf_config.architectures[0] = "MiMoMTP"
  if (
@@ -199,6 +206,8 @@
  "DeepseekV2ForCausalLM" in self.hf_config.architectures
  or "DeepseekV3ForCausalLM" in self.hf_config.architectures
  or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
+ or "LongcatFlashForCausalLM" in self.hf_config.architectures
+ or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
  ):
  self.head_dim = 256
  self.attention_arch = AttentionArch.MLA
@@ -270,6 +279,9 @@
  self.num_key_value_heads = self.num_attention_heads
  self.hidden_size = self.hf_text_config.hidden_size
  self.num_hidden_layers = self.hf_text_config.num_hidden_layers
+ self.num_attention_layers = self.num_hidden_layers
+ if "LongcatFlashForCausalLM" in self.hf_config.architectures:
+ self.num_attention_layers = self.num_hidden_layers * 2
  self.num_nextn_predict_layers = getattr(
  self.hf_text_config, "num_nextn_predict_layers", None
  )
sglang/srt/connector/__init__.py CHANGED
@@ -20,7 +20,7 @@ class ConnectorType(str, enum.Enum):
  KV = "KV"


- def create_remote_connector(url, device="cpu") -> BaseConnector:
+ def create_remote_connector(url, **kwargs) -> BaseConnector:
  connector_type = parse_connector_type(url)
  if connector_type == "redis":
  return RedisConnector(url)
sglang/srt/connector/base_connector.py CHANGED
@@ -20,9 +20,8 @@ class BaseConnector(ABC):
  <connector_type://<host>:<port>/<model_name>/files/<filename>
  """

- def __init__(self, url: str, device: torch.device = "cpu"):
+ def __init__(self, url: str):
  self.url = url
- self.device = device
  self.closed = False
  self.local_dir = tempfile.mkdtemp()
  for sig in (signal.SIGINT, signal.SIGTERM):
sglang/srt/connector/redis.py CHANGED
@@ -15,10 +15,10 @@ logger = logging.getLogger(__name__)

  class RedisConnector(BaseKVConnector):

- def __init__(self, url: str, device: torch.device = "cpu"):
+ def __init__(self, url: str):
  import redis

- super().__init__(url, device)
+ super().__init__(url)
  parsed_url = urlparse(url)
  self.connection = redis.Redis(host=parsed_url.hostname, port=parsed_url.port)
  self.model_name = parsed_url.path.lstrip("/")
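The connector constructors above no longer accept a device argument; create_remote_connector now forwards any extra options through **kwargs instead. A hedged usage sketch (the URL is a placeholder following the format in the BaseConnector docstring):

    from sglang.srt.connector import create_remote_connector

    conn = create_remote_connector("redis://localhost:6379/my-model")  # no device parameter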