sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.layers.moe import initialize_moe_config
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -509,6 +510,8 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    initialize_moe_config(server_args)
+
     # Set CPU affinity
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
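The only functional change above is that `latency_test` now calls `initialize_moe_config(server_args)` before any benchmark work starts. A minimal sketch of that call order, using the real sglang imports from the diff but a hypothetical `prepare_benchmark` helper in place of the benchmark body:

```python
# Sketch only: illustrates the new setup order, not the benchmark itself.
# `prepare_benchmark` is a made-up name; the two imports are the ones referenced in the diff.
from sglang.srt.layers.moe import initialize_moe_config
from sglang.srt.server_args import ServerArgs


def prepare_benchmark(server_args: ServerArgs) -> None:
    # MoE runtime settings are derived from server_args up front,
    # before the scheduler or model runner is constructed.
    initialize_moe_config(server_args)
```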
sglang/bench_one_batch_server.py
CHANGED
@@ -18,7 +18,7 @@ import json
 import multiprocessing
 import os
 import time
-from typing import Tuple
+from typing import List, Tuple
 
 import requests
 
@@ -45,6 +45,7 @@ class BenchArgs:
     skip_warmup: bool = False
     show_report: bool = False
     profile: bool = False
+    profile_steps: int = 3
     profile_by_stage: bool = False
 
     @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
         parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
         parser.add_argument("--profile-by-stage", action="store_true")
 
     @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
     result_filename: str,
     tokenizer,
     profile: bool = False,
+    profile_steps: int = 3,
     profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@ def run_one_case(
     profile_link = None
     if profile:
         profile_link: str = run_profile(
-            url,
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
         )
 
     tic = time.perf_counter()
@@ -247,6 +252,71 @@ def run_one_case(
     )
 
 
+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if bench_args.base_url:
         proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 result_filename=bench_args.result_filename,
                 tokenizer=tokenizer,
                 profile=bench_args.profile,
+                profile_steps=bench_args.profile_steps,
                 profile_by_stage=bench_args.profile_by_stage,
             )[-1],
         )
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
 
-    summary = (
-        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
-    )
-    summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
-    if bench_args.profile:
-        summary += " profile |"
-
-    summary += "\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
-    if bench_args.profile:
-        summary += "-------------|"
-    summary += "\n"
-
-    for (
-        batch_size,
-        latency,
-        ttft,
-        input_throughput,
-        output_throughput,
-        overall_throughput,
-        last_gen_throughput,
-        acc_length,
-        trace_link,
-    ) in result:
-        if is_blackwell():
-            hourly_cost_per_gpu = 4  # $4/hour for one B200
-        else:
-            hourly_cost_per_gpu = 2  # $2/hour for one H100
-
-        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
-        input_util = 0.7
-        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
-        line = (
-            f"| {batch_size} | "
-            f"{latency:.2f} | "
-            f"{input_throughput:.2f} | "
-            f"{output_throughput:.2f} | "
-            f"{accept_length} | "
-            f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
-            f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
-        )
-        if trace_link:
-            line += f" [Profile]({trace_link}) |"
-        line += "\n"
-        summary += line
-
-    # print metrics table
+    summary = get_report_summary(result, server_args, bench_args)
     print(summary)
 
     if is_in_ci():
         write_github_step_summary(summary)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
     bench_args = BenchArgs.from_cli_args(args)
 
     run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
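Beyond the new `--profile-steps` flag (default 3, forwarded to `run_profile`) and the `main()` entry point, the report is now built by `get_report_summary()` with `tabulate` instead of hand-assembled markdown. A standalone sketch of that formatting call; the row values are invented and `pip install tabulate` is assumed:

```python
# Mimics how get_report_summary() renders its table (tablefmt="github", floatfmt=".2f").
import tabulate

headers = ["batch size", "latency (s)", "input throughput (tok/s)", "output throughput (tok/s)"]
rows = [
    [1, 1.84, 4096.31, 88.72],    # illustrative numbers only
    [8, 2.57, 18731.55, 603.18],
]
print(tabulate.tabulate(rows, headers=headers, tablefmt="github", floatfmt=".2f"))
```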
sglang/bench_serving.py
CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
 
 
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
        messages = [
            {
                "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
            },
        ]
    else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
     if request_func_input.image_data:
         payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
 
 
 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                 prompt=prompt,
                 prompt_len=prompt_len,
                 output_len=output_len,
-                image_data=image_data,
+                image_data=[image_data],
             )
         )
 
@@ -1113,6 +1130,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+        ],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
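The new `random-image` dataset (enabled with `--dataset-name random-image` and sized by `--random-image-num-images` / `--random-image-resolution`) attaches a list of base64 data URIs to every request. A standalone sketch of how one such image is produced, using the stdlib `base64` module instead of the optional `pybase64` that `bench_serving` prefers:

```python
# Sketch of the image generation inside sample_random_image_requests:
# a random RGB array, JPEG-encoded, wrapped in a data: URI.
import base64
import io

import numpy as np
from PIL import Image


def random_image_data_uri(width: int = 640, height: int = 360) -> str:
    arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
    buf = io.BytesIO()
    Image.fromarray(arr, mode="RGB").save(buf, format="JPEG", quality=85)
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")


print(random_image_data_uri()[:60], "...")
```

Each `DatasetRow.image_data` now holds a list of such URIs, and the OpenAI chat backend sends them as `image_url` content items ahead of the text prompt.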
sglang/profiler.py
CHANGED
sglang/srt/configs/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.longcat_flash import LongcatFlashConfig
 from sglang.srt.configs.step3_vl import (
     Step3TextConfig,
     Step3VisionEncoderConfig,
@@ -16,6 +17,7 @@ __all__ = [
     "ChatGLMConfig",
     "DbrxConfig",
     "DeepseekVL2Config",
+    "LongcatFlashConfig",
     "MultiModalityConfig",
     "KimiVLConfig",
     "MoonViTConfig",
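With the re-export in place, the new config class (added in `sglang/srt/configs/longcat_flash.py`, next diff) is importable from the package root. A quick check, assuming an installation that includes this version:

```python
# Assumes sglang >= 0.5.2rc0 is installed; prints "longcat_flash".
from sglang.srt.configs import LongcatFlashConfig

print(LongcatFlashConfig().model_type)
```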
sglang/srt/configs/longcat_flash.py
ADDED
@@ -0,0 +1,104 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+FLASH_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class LongcatFlashConfig(PretrainedConfig):
+    model_type = "longcat_flash"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        hidden_size=6144,
+        intermediate_size=None,
+        ffn_hidden_size=12288,
+        expert_ffn_hidden_size=2048,
+        num_layers=28,
+        num_hidden_layers=None,
+        num_attention_heads=64,
+        ep_size=1,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=128,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_routed_experts=512,
+        moe_topk=12,
+        norm_topk_prob=False,
+        max_position_embeddings=131072,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mla_scale_q_lora=True,
+        mla_scale_kv_lora=True,
+        torch_dtype="bfloat16",
+        params_dtype="bfloat16",
+        rounter_params_dtype="float32",
+        router_bias=False,
+        topk_method=None,
+        routed_scaling_factor=6.0,
+        zero_expert_num=256,
+        zero_expert_type="identity",
+        nextn_use_scmoe=False,
+        num_nextn_predict_layers=1,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            torch_dtype=torch_dtype,
+            params_dtype=params_dtype,
+            rounter_params_dtype=rounter_params_dtype,
+            topk_method=topk_method,
+            router_bias=router_bias,
+            nextn_use_scmoe=nextn_use_scmoe,
+            num_nextn_predict_layers=num_nextn_predict_layers,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = (
+            num_hidden_layers if num_hidden_layers is not None else num_layers
+        )
+        self.intermediate_size = (
+            intermediate_size if intermediate_size is not None else ffn_hidden_size
+        )
+        self.moe_intermediate_size = expert_ffn_hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.ep_size = ep_size
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.n_routed_experts = n_routed_experts
+        self.moe_topk = moe_topk
+        self.norm_topk_prob = norm_topk_prob
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mla_scale_q_lora = mla_scale_q_lora
+        self.mla_scale_kv_lora = mla_scale_kv_lora
+        self.zero_expert_num = zero_expert_num
+        self.zero_expert_type = zero_expert_type
+        self.routed_scaling_factor = routed_scaling_factor
+        self.hidden_act = "silu"
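A hedged instantiation sketch of the class above: it exposes DeepSeek-style MLA/MoE fields (`kv_lora_rank`, `q_lora_rank`, `moe_topk`, ...) and falls back to the checkpoint's own names (`num_layers`, `ffn_hidden_size`, `expert_ffn_hidden_size`) when the HF-style ones are not given. The values below are arbitrary:

```python
from sglang.srt.configs.longcat_flash import LongcatFlashConfig

cfg = LongcatFlashConfig(num_layers=2, ffn_hidden_size=4096, expert_ffn_hidden_size=1024)
print(cfg.num_hidden_layers)      # 2     (falls back to num_layers)
print(cfg.intermediate_size)      # 4096  (falls back to ffn_hidden_size)
print(cfg.moe_intermediate_size)  # 1024  (taken from expert_ffn_hidden_size)
```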
sglang/srt/configs/model_config.py
CHANGED
@@ -132,6 +132,13 @@ class ModelConfig:
         if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
             self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
 
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "LongcatFlashForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "LongcatFlashForCausalLMNextN"
+            self.hf_config.num_hidden_layers = self.hf_config.num_nextn_predict_layers
+
         if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
             self.hf_config.architectures[0] = "MiMoMTP"
         if (
@@ -199,6 +206,8 @@ class ModelConfig:
             "DeepseekV2ForCausalLM" in self.hf_config.architectures
             or "DeepseekV3ForCausalLM" in self.hf_config.architectures
             or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
+            or "LongcatFlashForCausalLM" in self.hf_config.architectures
+            or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
         ):
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
@@ -270,6 +279,9 @@ class ModelConfig:
             self.num_key_value_heads = self.num_attention_heads
         self.hidden_size = self.hf_text_config.hidden_size
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
+        self.num_attention_layers = self.num_hidden_layers
+        if "LongcatFlashForCausalLM" in self.hf_config.architectures:
+            self.num_attention_layers = self.num_hidden_layers * 2
         self.num_nextn_predict_layers = getattr(
             self.hf_text_config, "num_nextn_predict_layers", None
         )
sglang/srt/connector/__init__.py
CHANGED
@@ -20,7 +20,7 @@ class ConnectorType(str, enum.Enum):
     KV = "KV"
 
 
-def create_remote_connector(url,
+def create_remote_connector(url, **kwargs) -> BaseConnector:
     connector_type = parse_connector_type(url)
     if connector_type == "redis":
         return RedisConnector(url)
sglang/srt/connector/base_connector.py
CHANGED
@@ -20,9 +20,8 @@ class BaseConnector(ABC):
     <connector_type://<host>:<port>/<model_name>/files/<filename>
     """
 
-    def __init__(self, url: str
+    def __init__(self, url: str):
         self.url = url
-        self.device = device
         self.closed = False
         self.local_dir = tempfile.mkdtemp()
         for sig in (signal.SIGINT, signal.SIGTERM):
sglang/srt/connector/redis.py
CHANGED
@@ -15,10 +15,10 @@ logger = logging.getLogger(__name__)
 
 class RedisConnector(BaseKVConnector):
 
-    def __init__(self, url: str
+    def __init__(self, url: str):
         import redis
 
-        super().__init__(url
+        super().__init__(url)
         parsed_url = urlparse(url)
         self.connection = redis.Redis(host=parsed_url.hostname, port=parsed_url.port)
         self.model_name = parsed_url.path.lstrip("/")