sglang 0.4.3.post3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries, as they appear in those public registries. It is provided for informational purposes only.
- sglang/bench_serving.py +2 -2
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +95 -49
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +5 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +72 -8
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +33 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +212 -117
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +258 -782
- sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
- sglang/srt/managers/tokenizer_manager.py +7 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +63 -34
- sglang/srt/mem_cache/memory_pool.py +112 -46
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/metrics/collector.py +8 -0
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +12 -8
- sglang/srt/model_executor/model_runner.py +153 -134
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +25 -19
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +37 -15
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +19 -20
- sglang/srt/speculative/build_eagle_tree.py +6 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -11
- sglang/srt/speculative/eagle_utils.py +2 -1
- sglang/srt/speculative/eagle_worker.py +109 -38
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/METADATA +9 -9
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/RECORD +128 -83
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/top_level.txt +0 -0
sglang/srt/conversation.py
CHANGED
@@ -44,6 +44,7 @@ class SeparatorStyle(IntEnum):
     CHATGLM3 = auto()
     DEEPSEEK_CHAT = auto()
     METAMATH = auto()
+    QWEN2_VL_EMBED = auto()


 @dataclasses.dataclass
@@ -110,6 +111,15 @@ class Conversation:
                 else:
                     ret += role + "\n"
             return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_VL_EMBED:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            ret += self.stop_str
+            return ret
         elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
             ret = system_prompt
             for role, message in self.messages:
@@ -181,7 +191,7 @@ class Conversation:

             for i, (role, message) in enumerate(self.messages):
                 if i % 2 == 0:
-                    ret += f"[Round {i//2 + round_add_n}]{self.sep}"
+                    ret += f"[Round {i // 2 + round_add_n}]{self.sep}"

                 if message:
                     ret += f"{role}:{message}{self.sep}"
@@ -366,6 +376,46 @@ def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates


+def generate_embedding_convs(
+    texts: List[str], images: List[str], template_name: str
+) -> List[Conversation]:
+    conv_template = chat_templates[template_name].copy()
+    convs = []
+    for text, image in zip(texts, images):
+        conv = Conversation(
+            name=conv_template.name,
+            system_template=conv_template.system_template,
+            system_message=conv_template.system_message,
+            roles=conv_template.roles,
+            messages=list(conv_template.messages),  # prevent in-place modification
+            offset=conv_template.offset,
+            sep_style=SeparatorStyle(conv_template.sep_style),
+            sep=conv_template.sep,
+            sep2=conv_template.sep2,
+            stop_str=conv_template.stop_str,
+            image_data=[],
+            modalities=[],
+            image_token=conv_template.image_token,
+        )
+        real_content = ""
+
+        if image is not None:
+            image_token = (
+                conv.image_token + "\n"
+                if conv.name != "gme-qwen2-vl"
+                else conv.image_token
+            )
+            real_content += image_token
+        if text is not None:
+            real_content += text
+        conv.append_message(conv.roles[0], real_content)
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        convs.append(conv)
+
+    return convs
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -403,7 +453,6 @@ def generate_chat_conv(
                 conv.system_message = getattr(message.content[0], "text", "")
         elif msg_role == "user":
             # Handle the various types of Chat Request content types here.
-            role = conv.roles[0]
             if isinstance(message.content, str):
                 conv.append_message(conv.roles[0], message.content)
             else:
@@ -555,6 +604,20 @@ register_conv_template(
     )
 )

+# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
+register_conv_template(
+    Conversation(
+        name="gme-qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_VL_EMBED,
+        stop_str="<|endoftext|>",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 # Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
 register_conv_template(
     Conversation(
@@ -568,3 +631,18 @@ register_conv_template(
         image_token="(<image>./</image>)",
     )
 )
+
+# Reference: https://github.com/deepseek-ai/Janus?tab=readme-ov-file#janus-pro
+register_conv_template(
+    Conversation(
+        name="janus-pro",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language",
+        system_template="{system_message}.",
+        roles=("User", "Assistant"),
+        sep="\n\n",
+        sep2="<|end▁of▁sentence|>",
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        stop_str=["<|User|>", "<|end▁of▁sentence|>"],
+        image_token="<image_placeholder>",
+    )
+)
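The new `QWEN2_VL_EMBED` separator style is used by the `gme-qwen2-vl` template registered above, and `generate_embedding_convs` builds one conversation per (text, image) pair. A minimal sketch of how the rendered embedding prompt could be inspected; the example texts and image URL are hypothetical:

```python
from sglang.srt.conversation import generate_embedding_convs

# Hypothetical inputs: one text-only item and one text+image item.
convs = generate_embedding_convs(
    texts=["a photo of a cat", "describe this picture"],
    images=[None, "https://example.com/cat.jpg"],
    template_name="gme-qwen2-vl",
)
for conv in convs:
    # SeparatorStyle.QWEN2_VL_EMBED joins the system prompt, the user turn,
    # and the "<|endoftext|>" stop string, as implemented in the hunk above.
    print(conv.get_prompt())
```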
sglang/srt/custom_op.py
CHANGED
@@ -1,8 +1,12 @@
+from typing import Optional
+
 import torch
 from torch import nn

-
-
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()


 class CustomOp(nn.Module):
@@ -34,7 +38,64 @@ class CustomOp:
     def dispatch_forward(self):
         if _is_cuda:
             return self.forward_cuda
-        elif
+        elif _is_hip:
             return self.forward_hip
         else:
             return self.forward_native
+
+
+if _is_cuda:
+    from sgl_kernel import sgl_per_tensor_quant_fp8, sgl_per_token_quant_fp8
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize input tensor to FP8 (8-bit floating point) format.
+
+        Args:
+            input (torch.Tensor): Input tensor to be quantized
+            scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
+                If None, scales will be computed dynamically.
+            use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
+                determines the quantization granularity:
+                - True: compute scale per token
+                - False: compute single scale per tensor
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+                - quantized_tensor: The FP8 quantized version of input
+                - scale_tensor: The scaling factors used for quantization
+
+        Raises:
+            AssertionError: If input is not 2D or if static scale's numel != 1
+        """
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                sgl_per_token_quant_fp8(input, output, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                sgl_per_tensor_quant_fp8(
+                    input, output, scale, is_static=False
+                )  # False for dynamic
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            sgl_per_tensor_quant_fp8(
+                input, output, scale, is_static=True
+            )  # True for static
+
+        return output, scale
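The `scaled_fp8_quant` helper added above is defined only behind the `if _is_cuda:` guard, so it needs a CUDA build with `sgl_kernel` available. A usage sketch; the tensor size and dtype here are illustrative assumptions:

```python
import torch
from sglang.srt.custom_op import scaled_fp8_quant  # present only on CUDA builds

x = torch.randn(4, 128, device="cuda", dtype=torch.float16)

# Dynamic per-token scaling: one float32 scale per row, shape (4, 1).
q_tok, s_tok = scaled_fp8_quant(x, use_per_token_if_dynamic=True)

# Dynamic per-tensor scaling: a single scalar scale for the whole tensor.
q_all, s_all = scaled_fp8_quant(x)

# Static scaling: reuse a precomputed scalar scale (scale.numel() must be 1).
q_static, _ = scaled_fp8_quant(x, scale=s_all)
```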
sglang/srt/distributed/device_communicators/custom_all_reduce.py
CHANGED
@@ -22,15 +22,16 @@ from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip

 logger = logging.getLogger(__name__)

-
+_is_cuda = is_cuda()
+_is_hip = is_hip()

-if
+if _is_cuda:
     try:
         import pynvml
     except ImportError as e:
         logger.warning("Failed to import pynvml with %r", e)

-if
+if _is_hip:
     try:
         from amdsmi import (
             AmdSmiException,
@@ -43,7 +44,7 @@ if is_hip_:
         logger.warning("Failed to import amdsmi with %r", e)

 try:
-    if ops.use_vllm_custom_allreduce and not
+    if ops.use_vllm_custom_allreduce and not _is_hip:
         # Use vLLM custom allreduce
         ops.meta_size()
     else:
@@ -63,7 +64,7 @@ _R = TypeVar("_R")
 def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
     @wraps(fn)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-        if
+        if _is_hip:
             try:
                 amdsmi_init()
                 return fn(*args, **kwargs)
@@ -81,7 +82,7 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:

 @with_nvml_context
 def is_full_nvlink(physical_device_ids: List[int], world_size: int) -> bool:
-    if
+    if _is_hip:
         """
         query if the set of gpus are fully connected by xgmi (1 hop)
         """
@@ -145,7 +146,7 @@ def is_weak_contiguous(inp: torch.Tensor):
 class CustomAllreduce:
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
     _MAX_CAR_SIZE = 8192 * 1024
-    if
+    if _is_hip:
         # crossover is at 16MB buffer size for ROCm
         _MAX_CAR_SIZE = 2 * 8192 * 1024

@@ -229,7 +230,7 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
-        if
+        if _is_cuda or _is_hip:
             full_nvlink = is_full_nvlink(physical_device_ids, world_size)

         if world_size > 2 and not full_nvlink:
@@ -243,7 +244,7 @@ class CustomAllreduce:
         # this is expensive to compute at the first time
         # then we cache the result
         # On AMD GPU, p2p is always enabled between XGMI connected GPUs
-        if not
+        if not _is_hip and not _can_p2p(rank, world_size):
             logger.warning(
                 "Custom allreduce is disabled because your platform lacks "
                 "GPU P2P capability or P2P test failed. To silence this "
@@ -256,7 +257,7 @@ class CustomAllreduce:
         self.world_size = world_size
         self.full_nvlink = full_nvlink

-        if ops.use_vllm_custom_allreduce and not
+        if ops.use_vllm_custom_allreduce and not _is_hip:
             # Buffers memory are owned by this Python class and passed to C++.
             # Meta data composes of two parts: meta data for synchronization and a
             # temporary buffer for storing intermediate allreduce results.
@@ -279,7 +280,7 @@ class CustomAllreduce:
             )
             ops.register_buffer(self._ptr, self.buffer_ptrs)
         else:
-            if
+            if _is_hip:
                 # meta data buffers need to be "uncached" for signal on MI200
                 self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
                 self.buffer = torch.empty(
@@ -418,7 +419,7 @@ class CustomAllreduce:
             ops.register_buffer(self._ptr, inp, handles, offsets)

     def register_graph_buffers(self):
-        if
+        if _is_hip:
             handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
             handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
             logger.info("Registering %d cuda graph addresses", len(offset))
@@ -454,12 +455,12 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
         # little performance improvement over NCCL.
-        if ops.use_vllm_custom_allreduce and not
+        if ops.use_vllm_custom_allreduce and not _is_hip:
             if self.world_size == 2 or self.full_nvlink:
                 return inp_size < self.max_size
             return False

-        if
+        if _is_hip:
             if self.full_nvlink:
                 if self.world_size == 8:
                     if self.MSCCL:
@@ -532,7 +533,7 @@ class CustomAllreduce:
             return None
         if self._IS_CAPTURING:
             if torch.cuda.is_current_stream_capturing():
-                if
+                if _is_hip:
                     return self.all_reduce_reg(input)
                 else:
                     return self.all_reduce(input, registered=True)
@@ -541,7 +542,7 @@ class CustomAllreduce:
                 # allreduce is out-of-place.
                 return torch.empty_like(input)
         else:
-            if
+            if _is_hip:
                 # note: outside of cuda graph context,
                 # custom allreduce incurs a cost of cudaMemcpy, which should
                 # be small(<=1% of overall latency) compared to the performance
@@ -556,7 +557,7 @@ class CustomAllreduce:
         if ops.use_vllm_custom_allreduce:
             self.free_shared_buffer(self.meta_ptrs)
             self.free_shared_buffer(self.buffer_ptrs)
-        elif
+        elif _is_cuda:
             self.free_shared_buffer(self.buffer_ptrs)
             self.free_shared_buffer(self.tmp_result_buffer_ptrs)
             self.free_shared_buffer(self.barrier_in_ptrs)
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -1228,7 +1228,16 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         ray.shutdown()
     gc.collect()
     if not current_platform.is_cpu():
-        torch.cuda.
+        if hasattr(torch, "cuda") and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            if hasattr(torch._C, "_host_emptyCache"):
+                torch._C._host_emptyCache()
+            else:
+                logger.warning(
+                    "torch._C._host_emptyCache() only available in Pytorch >=2.5"
+                )
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()


 def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
sglang/srt/entrypoints/engine.py
CHANGED
@@ -106,6 +106,8 @@ class Engine:
         tokenizer_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args
         )
+
+        self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info

@@ -214,13 +216,13 @@ class Engine:
     def encode(
         self,
         prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
     ) -> Dict:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
         Please refer to `EmbeddingReqInput` for the documentation.
         """
-
-        obj = EmbeddingReqInput(text=prompt)
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
         ret = loop.run_until_complete(generator.__anext__())
@@ -374,7 +376,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
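With the new `image_data` argument on `Engine.encode` (see the hunk above), multimodal embedding requests can be issued offline. A hedged sketch; the image path is hypothetical and the exact output layout depends on the request:

```python
import sglang as sgl

llm = sgl.Engine(
    model_path="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",  # embedding-capable VLM
    is_embedding=True,
)
out = llm.encode(
    prompt="a photo of a cat",
    image_data="/path/to/cat.jpg",  # hypothetical local image
)
print(out)  # expected to carry the embedding vector plus meta info
llm.shutdown()
```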
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -614,7 +614,7 @@ def launch_server(

     Note:
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
-    2. Inter-process communication is done through
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     tokenizer_manager, scheduler_info = _launch_subprocesses(server_args=server_args)
     set_global_state(
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -30,13 +30,20 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

-from sglang.srt.configs import
+from sglang.srt.configs import (
+    ChatGLMConfig,
+    DbrxConfig,
+    ExaoneConfig,
+    MultiModalityConfig,
+    Qwen2_5_VLConfig,
+)

 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
     Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
+    MultiModalityConfig.model_type: MultiModalityConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
@@ -66,6 +73,14 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
+
+    # FIXME: Pour contents of janus-pro's langauge_config to first-level
+    if isinstance(model, str) and model.lower().startswith("deepseek-ai/janus-pro"):
+        assert hasattr(config, "language_config")
+        for key, val in config.language_config.__dict__.items():
+            setattr(config, key, val)
+        setattr(config, "architectures", ["MultiModalityCausalLM"])
+
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model, revision=revision)
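The Janus-Pro special case above copies every `language_config` field to the top level of the config and forces `architectures` to `["MultiModalityCausalLM"]`. A small sketch of what that implies for callers of `get_config`; treat the model id and printed fields as assumptions:

```python
from sglang.srt.hf_transformers_utils import get_config

# For deepseek-ai/janus-pro* checkpoints, language_config is flattened into the
# top-level config and architectures is overridden.
cfg = get_config("deepseek-ai/Janus-Pro-7B", trust_remote_code=True)
print(cfg.architectures)  # expected: ["MultiModalityCausalLM"]
```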