sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +1 -11
- sglang/bench_serving.py +149 -1
- sglang/lang/chat_template.py +44 -0
- sglang/srt/configs/deepseekvl2.py +3 -0
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +17 -0
- sglang/srt/constrained/xgrammar_backend.py +11 -19
- sglang/srt/conversation.py +30 -3
- sglang/srt/disaggregation/decode.py +4 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +9 -18
- sglang/srt/disaggregation/nixl/conn.py +241 -71
- sglang/srt/disaggregation/utils.py +44 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +14 -2
- sglang/srt/entrypoints/http_server.py +28 -1
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +146 -50
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
- sglang/srt/layers/moe/ep_moe/layer.py +120 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +5 -0
- sglang/srt/layers/quantization/fp8.py +108 -95
- sglang/srt/layers/quantization/fp8_kernel.py +79 -60
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/lora/lora_manager.py +10 -13
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/schedule_batch.py +19 -1
- sglang/srt/managers/schedule_policy.py +11 -5
- sglang/srt/managers/scheduler.py +28 -13
- sglang/srt/managers/tokenizer_manager.py +24 -13
- sglang/srt/managers/tp_worker.py +9 -12
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +2 -2
- sglang/srt/model_executor/model_runner.py +44 -33
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +55 -20
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +53 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +24 -40
- sglang/srt/openai_api/protocol.py +28 -16
- sglang/srt/reasoning_parser.py +2 -2
- sglang/srt/sampling/sampling_batch_info.py +54 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +30 -6
- sglang/srt/utils.py +35 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -137,17 +137,7 @@ def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(
-        server_args.model_path,
-        trust_remote_code=server_args.trust_remote_code,
-        revision=server_args.revision,
-        context_length=server_args.context_length,
-        model_override_args=server_args.json_model_override_args,
-        is_embedding=server_args.is_embedding,
-        enable_multimodal=server_args.enable_multimodal,
-        dtype=server_args.dtype,
-        quantization=server_args.quantization,
-    )
+    model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
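
Post3 replaces the hand-rolled constructor call with a single classmethod. A rough sketch of the shape `ModelConfig.from_server_args` presumably has, assuming it simply forwards the same `ServerArgs` fields the deleted lines passed explicitly (the real implementation lives in sglang/srt/configs/model_config.py, which this release also touches, and may accept extra overrides):

    # Hypothetical sketch of the new helper, not the verbatim sglang code.
    class ModelConfig:
        ...

        @classmethod
        def from_server_args(cls, server_args: "ServerArgs", **kwargs) -> "ModelConfig":
            # Forward the fields the old bench_one_batch.py call site passed by hand.
            return cls(
                server_args.model_path,
                trust_remote_code=server_args.trust_remote_code,
                revision=server_args.revision,
                context_length=server_args.context_length,
                model_override_args=server_args.json_model_override_args,
                is_embedding=server_args.is_embedding,
                enable_multimodal=server_args.enable_multimodal,
                dtype=server_args.dtype,
                quantization=server_args.quantization,
                **kwargs,
            )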
sglang/bench_serving.py
CHANGED
@@ -58,6 +58,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
+    image_data: str
     extra_request_body: Dict[str, Any]
 
 
@@ -347,6 +348,11 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
+
+    # Add image data if available
+    if request_func_input.image_data:
+        payload["image_data"] = request_func_input.image_data
+
     headers = get_auth_headers()
 
     output = RequestFuncOutput()
@@ -510,6 +516,13 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             args=args,
         )
+    elif args.dataset_name == "mmmu":
+        input_requests = sample_mmmu_requests(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.random_output_len,
+            random_sample=True,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -597,6 +610,121 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename
 
 
+def sample_mmmu_requests(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    random_sample: bool = True,
+) -> List[Tuple[str, int, int]]:
+    """
+    Sample requests from the MMMU dataset using HuggingFace datasets.
+
+    Args:
+        num_requests: Number of requests to sample.
+        tokenizer: Tokenizer to use for token counting.
+        fixed_output_len: If provided, use this fixed output length for all requests.
+        random_sample: Whether to randomly sample or take the first N.
+
+    Returns:
+        List of tuples (prompt, prompt_token_len, output_token_len).
+    """
+    try:
+        import base64
+        import io
+
+        from datasets import load_dataset
+    except ImportError:
+        raise ImportError("Please install datasets: pip install datasets")
+
+    print("Loading MMMU dataset from HuggingFace...")
+
+    try:
+        print("Attempting to load MMMU Math dataset...")
+        mmmu_dataset = load_dataset("MMMU/MMMU", "Math", split="test")
+        print(
+            f"Successfully loaded MMMU Math dataset from HuggingFace with {len(mmmu_dataset)} examples"
+        )
+    except Exception as e:
+        print(f"Failed to load MMMU Math dataset: {e}")
+        raise ValueError(f"Failed to load MMMU dataset: {e}")
+
+    # Sample from the dataset
+    if len(mmmu_dataset) > num_requests:
+        if random_sample:
+            # Random sample
+            indices = random.sample(range(len(mmmu_dataset)), num_requests)
+            sample_dataset = mmmu_dataset.select(indices)
+        else:
+            # Take first N
+            sample_dataset = mmmu_dataset.select(
+                range(min(num_requests, len(mmmu_dataset)))
+            )
+    else:
+        print(f"Dataset has less than {num_requests} examples, using all examples")
+        sample_dataset = mmmu_dataset
+
+    print(f"Selected {len(sample_dataset)} examples for benchmarking")
+
+    # Create prompts
+    filtered_dataset = []
+
+    for i, example in enumerate(sample_dataset):
+        try:
+            # Extract image_1
+            image = example.get("image_1")
+
+            if image is not None:
+                if hasattr(image, "save"):
+                    # Convert RGBA images to RGB before encoding
+                    if image.mode == "RGBA":
+                        image = image.convert("RGB")
+
+                    # Encode image to base64
+                    buffered = io.BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    image_path = f"data:image/jpeg;base64,{img_str}"
+                else:
+                    continue
+
+                # Extract the question
+                question = example.get("question")
+
+                # Create the prompt with image, question
+                prompt = f"Question: {question}\n\nAnswer: "
+                prompt = tokenizer.apply_chat_template(
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "image_url", "image_url": {"url": image_path}},
+                                {"type": "text", "text": prompt},
+                            ],
+                        }
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                prompt = f"<image>{image_path}</image>{prompt}"
+
+                # Calculate token lengths
+                # Note: This is approximate since we're not rendering the actual image tokens
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = (
+                    len(prompt_token_ids) + 512
+                )  # Add estimate for image tokens
+
+                output_len = fixed_output_len if fixed_output_len is not None else 256
+
+                filtered_dataset.append((prompt, prompt_len, output_len))
+
+        except Exception as e:
+            print(f"Error processing example {i}: {e}")
+
+    print(f"\nCreated {len(filtered_dataset)} MMMU prompts")
+    return filtered_dataset
+
+
 def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
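
For orientation, a hedged usage sketch of the sampler added above, using only names visible in this hunk. The tokenizer checkpoint is an arbitrary example (any tokenizer whose chat template accepts OpenAI-style content lists should work), and the `datasets` package must be installed:

    from transformers import AutoTokenizer

    from sglang.bench_serving import sample_mmmu_requests

    # Example checkpoint; the sampler only needs apply_chat_template() and encode().
    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    requests = sample_mmmu_requests(
        num_requests=8,
        tokenizer=tok,
        fixed_output_len=128,
        random_sample=True,
    )
    prompt, prompt_len, output_len = requests[0]
    # prompt starts with "<image>data:image/jpeg;base64,...</image>"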
@@ -1004,6 +1132,15 @@ async def benchmark(
     else:
         lora_name = None
 
+    if "<image>" in test_prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
+        image_data = image_match.group(1) if image_match else None
+        test_prompt = image_match.group(2) if image_match else test_prompt
+    else:
+        image_data = None
+
     # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
@@ -1012,6 +1149,7 @@ async def benchmark(
         prompt_len=test_prompt_len,
         output_len=min(test_output_len, 32),
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
 
@@ -1063,6 +1201,15 @@ async def benchmark(
         else:
             lora_name = None
 
+        if "<image>" in prompt:
+            import re
+
+            image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
+            image_data = image_match.group(1) if image_match else None
+            prompt = image_match.group(2) if image_match else prompt
+        else:
+            image_data = None
+
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1070,6 +1217,7 @@ async def benchmark(
             prompt_len=prompt_len,
             output_len=output_len,
             lora_name=lora_name,
+            image_data=image_data,
             extra_request_body=extra_request_body,
         )
         tasks.append(
@@ -1444,7 +1592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
sglang/lang/chat_template.py
CHANGED
@@ -270,6 +270,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -395,6 +418,20 @@ register_chat_template(
     )
 )
 
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+register_chat_template(
+    ChatTemplate(
+        name="internvl-2-5",
+        default_system_prompt="你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -565,6 +602,13 @@ def match_gemma3_instruct(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_internvl_chat(model_path: str):
+    model_path = model_path.lower()
+    if "internvl" in model_path:
+        return get_chat_template("internvl-2-5")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
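
For reference, a template registered this way is looked up with `get_chat_template` and rendered through `ChatTemplate.get_prompt` (the same helper the `__main__` block in the last hunk exercises). A hedged usage sketch, with the rendered string shown only approximately:

    from sglang.lang.chat_template import get_chat_template

    template = get_chat_template("internvl-2-5")
    prompt = template.get_prompt(
        [
            {"role": "system", "content": None},  # None -> default_system_prompt
            {"role": "user", "content": "Describe the image."},
        ]
    )
    # Roughly: "<|im_start|>system\n...<|im_end|>\n<|im_start|>user\nDescribe the image.<|im_end|>\n..."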
sglang/srt/configs/device_config.py
CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
     device: Optional[torch.device]
 
     def __init__(self, device: str = "cuda") -> None:
-        if device in ["cuda", "xpu", "hpu", "cpu"]:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
             self.device_type = device
         else:
             raise RuntimeError(f"Not supported device type: {device}")