sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_serving.py +1 -1
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/function_call_parser.py +33 -2
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +1 -3
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/dp_attention.py +30 -2
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/logits_processor.py +1 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +74 -8
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +32 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +213 -118
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +176 -683
- sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
- sglang/srt/managers/tokenizer_manager.py +6 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +71 -34
- sglang/srt/mem_cache/memory_pool.py +81 -17
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/model_executor/cuda_graph_runner.py +68 -20
- sglang/srt/model_executor/forward_batch_info.py +23 -10
- sglang/srt/model_executor/model_runner.py +63 -63
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +200 -191
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +59 -35
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +24 -16
- sglang/srt/speculative/eagle_worker.py +75 -39
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/conversation.py
CHANGED
@@ -44,6 +44,7 @@ class SeparatorStyle(IntEnum):
     CHATGLM3 = auto()
     DEEPSEEK_CHAT = auto()
     METAMATH = auto()
+    QWEN2_VL_EMBED = auto()
 
 
 @dataclasses.dataclass
@@ -110,6 +111,15 @@ class Conversation:
                 else:
                     ret += role + "\n"
             return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_VL_EMBED:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            ret += self.stop_str
+            return ret
         elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
             ret = system_prompt
             for role, message in self.messages:
@@ -181,7 +191,7 @@ class Conversation:
 
             for i, (role, message) in enumerate(self.messages):
                 if i % 2 == 0:
-                    ret += f"[Round {i//2 + round_add_n}]{self.sep}"
+                    ret += f"[Round {i // 2 + round_add_n}]{self.sep}"
 
                 if message:
                     ret += f"{role}:{message}{self.sep}"
@@ -366,6 +376,46 @@ def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates
 
 
+def generate_embedding_convs(
+    texts: List[str], images: List[str], template_name: str
+) -> List[Conversation]:
+    conv_template = chat_templates[template_name].copy()
+    convs = []
+    for text, image in zip(texts, images):
+        conv = Conversation(
+            name=conv_template.name,
+            system_template=conv_template.system_template,
+            system_message=conv_template.system_message,
+            roles=conv_template.roles,
+            messages=list(conv_template.messages),  # prevent in-place modification
+            offset=conv_template.offset,
+            sep_style=SeparatorStyle(conv_template.sep_style),
+            sep=conv_template.sep,
+            sep2=conv_template.sep2,
+            stop_str=conv_template.stop_str,
+            image_data=[],
+            modalities=[],
+            image_token=conv_template.image_token,
+        )
+        real_content = ""
+
+        if image is not None:
+            image_token = (
+                conv.image_token + "\n"
+                if conv.name != "gme-qwen2-vl"
+                else conv.image_token
+            )
+            real_content += image_token
+        if text is not None:
+            real_content += text
+        conv.append_message(conv.roles[0], real_content)
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        convs.append(conv)
+
+    return convs
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -403,7 +453,6 @@ def generate_chat_conv(
             conv.system_message = getattr(message.content[0], "text", "")
         elif msg_role == "user":
             # Handle the various types of Chat Request content types here.
-            role = conv.roles[0]
             if isinstance(message.content, str):
                 conv.append_message(conv.roles[0], message.content)
             else:
@@ -555,6 +604,20 @@ register_conv_template(
     )
 )
 
+# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
+register_conv_template(
+    Conversation(
+        name="gme-qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_VL_EMBED,
+        stop_str="<|endoftext|>",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 # Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
 register_conv_template(
     Conversation(
@@ -568,3 +631,18 @@ register_conv_template(
         image_token="(<image>./</image>)",
     )
 )
+
+# Reference: https://github.com/deepseek-ai/Janus?tab=readme-ov-file#janus-pro
+register_conv_template(
+    Conversation(
+        name="janus-pro",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language",
+        system_template="{system_message}.",
+        roles=("User", "Assistant"),
+        sep="\n\n",
+        sep2="<|end▁of▁sentence|>",
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        stop_str=["<|User|>", "<|end▁of▁sentence|>"],
+        image_token="<image_placeholder>",
+    )
+)
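A minimal usage sketch (not part of the diff) of the generate_embedding_convs helper and the gme-qwen2-vl template added above; the image filename is a placeholder, and a text-only item passes None for its image:

from sglang.srt.conversation import generate_embedding_convs

convs = generate_embedding_convs(
    texts=["a photo of a cat", "find similar images"],
    images=["cat.jpg", None],  # placeholder path; None means a text-only item
    template_name="gme-qwen2-vl",
)
for conv in convs:
    # With the new QWEN2_VL_EMBED separator style, the rendered prompt
    # ends with the template's stop_str ("<|endoftext|>").
    print(conv.get_prompt())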
sglang/srt/custom_op.py
CHANGED
@@ -1,8 +1,12 @@
+from typing import Optional
+
 import torch
 from torch import nn
 
-
-
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
 
 
 class CustomOp(nn.Module):
@@ -34,7 +38,64 @@ class CustomOp(nn.Module):
     def dispatch_forward(self):
         if _is_cuda:
             return self.forward_cuda
-        elif
+        elif _is_hip:
             return self.forward_hip
         else:
             return self.forward_native
+
+
+if _is_cuda:
+    from sgl_kernel import sgl_per_tensor_quant_fp8, sgl_per_token_quant_fp8
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize input tensor to FP8 (8-bit floating point) format.
+
+        Args:
+            input (torch.Tensor): Input tensor to be quantized
+            scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
+                If None, scales will be computed dynamically.
+            use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
+                determines the quantization granularity:
+                - True: compute scale per token
+                - False: compute single scale per tensor
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+                - quantized_tensor: The FP8 quantized version of input
+                - scale_tensor: The scaling factors used for quantization
+
+        Raises:
+            AssertionError: If input is not 2D or if static scale's numel != 1
+        """
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                sgl_per_token_quant_fp8(input, output, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                sgl_per_tensor_quant_fp8(
+                    input, output, scale, is_static=False
+                )  # False for dynamic
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            sgl_per_tensor_quant_fp8(
+                input, output, scale, is_static=True
+            )  # True for static
+
+        return output, scale
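A minimal usage sketch (not part of the diff, assuming a CUDA build with the sgl_kernel package installed) of the scaled_fp8_quant helper added above, using dynamic per-token scaling:

import torch

from sglang.srt.custom_op import scaled_fp8_quant

x = torch.randn(4, 128, device="cuda", dtype=torch.float16)
x_fp8, scales = scaled_fp8_quant(x, scale=None, use_per_token_if_dynamic=True)
print(x_fp8.dtype)   # torch.float8_e4m3fn on NVIDIA GPUs
print(scales.shape)  # torch.Size([4, 1]): one scale per row (token)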
sglang/srt/distributed/device_communicators/custom_all_reduce.py
CHANGED
@@ -22,15 +22,16 @@ from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip
 
 logger = logging.getLogger(__name__)
 
-
+_is_cuda = is_cuda()
+_is_hip = is_hip()
 
-if
+if _is_cuda:
     try:
         import pynvml
     except ImportError as e:
         logger.warning("Failed to import pynvml with %r", e)
 
-if
+if _is_hip:
     try:
         from amdsmi import (
             AmdSmiException,
@@ -43,7 +44,7 @@ if is_hip_:
         logger.warning("Failed to import amdsmi with %r", e)
 
 try:
-    if ops.use_vllm_custom_allreduce and not
+    if ops.use_vllm_custom_allreduce and not _is_hip:
         # Use vLLM custom allreduce
         ops.meta_size()
     else:
@@ -63,7 +64,7 @@ _R = TypeVar("_R")
 def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
     @wraps(fn)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-        if
+        if _is_hip:
             try:
                 amdsmi_init()
                 return fn(*args, **kwargs)
@@ -81,7 +82,7 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
 
 @with_nvml_context
 def is_full_nvlink(physical_device_ids: List[int], world_size: int) -> bool:
-    if
+    if _is_hip:
         """
         query if the set of gpus are fully connected by xgmi (1 hop)
         """
@@ -145,7 +146,7 @@ def is_weak_contiguous(inp: torch.Tensor):
 class CustomAllreduce:
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
     _MAX_CAR_SIZE = 8192 * 1024
-    if
+    if _is_hip:
         # crossover is at 16MB buffer size for ROCm
         _MAX_CAR_SIZE = 2 * 8192 * 1024
 
@@ -229,7 +230,7 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
-        if
+        if _is_cuda or _is_hip:
            full_nvlink = is_full_nvlink(physical_device_ids, world_size)
 
        if world_size > 2 and not full_nvlink:
@@ -243,7 +244,7 @@ class CustomAllreduce:
        # this is expensive to compute at the first time
        # then we cache the result
        # On AMD GPU, p2p is always enabled between XGMI connected GPUs
-        if not
+        if not _is_hip and not _can_p2p(rank, world_size):
            logger.warning(
                "Custom allreduce is disabled because your platform lacks "
                "GPU P2P capability or P2P test failed. To silence this "
@@ -256,7 +257,7 @@ class CustomAllreduce:
        self.world_size = world_size
        self.full_nvlink = full_nvlink
 
-        if ops.use_vllm_custom_allreduce and not
+        if ops.use_vllm_custom_allreduce and not _is_hip:
            # Buffers memory are owned by this Python class and passed to C++.
            # Meta data composes of two parts: meta data for synchronization and a
            # temporary buffer for storing intermediate allreduce results.
@@ -279,7 +280,7 @@ class CustomAllreduce:
            )
            ops.register_buffer(self._ptr, self.buffer_ptrs)
        else:
-            if
+            if _is_hip:
                # meta data buffers need to be "uncached" for signal on MI200
                self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
                self.buffer = torch.empty(
@@ -418,7 +419,7 @@ class CustomAllreduce:
            ops.register_buffer(self._ptr, inp, handles, offsets)
 
    def register_graph_buffers(self):
-        if
+        if _is_hip:
            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
            handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
            logger.info("Registering %d cuda graph addresses", len(offset))
@@ -454,12 +455,12 @@ class CustomAllreduce:
            return False
        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
-        if ops.use_vllm_custom_allreduce and not
+        if ops.use_vllm_custom_allreduce and not _is_hip:
            if self.world_size == 2 or self.full_nvlink:
                return inp_size < self.max_size
            return False
 
-        if
+        if _is_hip:
            if self.full_nvlink:
                if self.world_size == 8:
                    if self.MSCCL:
@@ -532,7 +533,7 @@ class CustomAllreduce:
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
-                if
+                if _is_hip:
                    return self.all_reduce_reg(input)
                else:
                    return self.all_reduce(input, registered=True)
@@ -541,7 +542,7 @@ class CustomAllreduce:
                # allreduce is out-of-place.
                return torch.empty_like(input)
        else:
-            if
+            if _is_hip:
                # note: outside of cuda graph context,
                # custom allreduce incurs a cost of cudaMemcpy, which should
                # be small(<=1% of overall latency) compared to the performance
@@ -556,7 +557,7 @@ class CustomAllreduce:
        if ops.use_vllm_custom_allreduce:
            self.free_shared_buffer(self.meta_ptrs)
            self.free_shared_buffer(self.buffer_ptrs)
-        elif
+        elif _is_cuda:
            self.free_shared_buffer(self.buffer_ptrs)
            self.free_shared_buffer(self.tmp_result_buffer_ptrs)
            self.free_shared_buffer(self.barrier_in_ptrs)
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -1228,7 +1228,16 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         ray.shutdown()
     gc.collect()
     if not current_platform.is_cpu():
-        torch.cuda.
+        if hasattr(torch, "cuda") and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            if hasattr(torch._C, "_host_emptyCache"):
+                torch._C._host_emptyCache()
+            else:
+                logger.warning(
+                    "torch._C._host_emptyCache() only available in Pytorch >=2.5"
+                )
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
 
 
 def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
sglang/srt/entrypoints/engine.py
CHANGED
@@ -106,6 +106,8 @@ class Engine:
         tokenizer_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args
         )
+
+        self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info
 
@@ -214,13 +216,13 @@ class Engine:
     def encode(
         self,
         prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
     ) -> Dict:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
         Please refer to `EmbeddingReqInput` for the documentation.
         """
-
-        obj = EmbeddingReqInput(text=prompt)
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
         ret = loop.run_until_complete(generator.__anext__())
@@ -374,7 +376,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -614,7 +614,7 @@ def launch_server(
 
     Note:
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
-    2. Inter-process communication is done through
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     tokenizer_manager, scheduler_info = _launch_subprocesses(server_args=server_args)
     set_global_state(
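A minimal usage sketch (not part of the diff) of the new image_data argument on Engine.encode shown in the engine.py hunks above; the model path and image file are placeholders, and the engine must be launched in embedding mode:

import sglang as sgl

llm = sgl.Engine(
    model_path="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",  # placeholder embedding model
    is_embedding=True,
)
out = llm.encode(
    prompt="a photo of a cat",
    image_data="cat.jpg",  # placeholder; a list of paths/URLs is also accepted
)
print(len(out["embedding"]))  # embedding vector for the text+image pair
llm.shutdown()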
sglang/srt/function_call_parser.py
CHANGED
@@ -318,6 +318,10 @@ class Qwen25Detector(BaseFormatDetector):
         self.bot_token = "<tool_call>"
         self.eot_token = "</tool_call>"
 
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
@@ -352,6 +356,10 @@ class MistralDetector(BaseFormatDetector):
         self.bot_token = "[TOOL_CALLS] ["
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
 
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Mistral format tool call."""
+        return self.bot_token in text
+
     def _clean_text(self, text: str) -> str:
         """
         clean text to only leave ''[TOOL_CALLS] [{"name": xxx, "arguments": {xxx}}]'
@@ -397,12 +405,21 @@ class Llama32Detector(BaseFormatDetector):
         super().__init__()
         self.bot_token = "<|python_tag|>"
 
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Llama 3.2 format tool call."""
+        # depending on the prompt format the Llama model may or may not
+        # prefix the output with the <|python_tag|> token
+        return "<|python_tag|>" in text or text.startswith("{")
+
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """Parse function calls from text, handling multiple JSON objects."""
-        if "<|python_tag|>" not in text:
+        if "<|python_tag|>" not in text and not text.startswith("{"):
             return []
 
-
+        if "<|python_tag|>" in text:
+            _, action_text = text.split("<|python_tag|>")
+        else:
+            action_text = text
 
         # Split by semicolon and process each part
         json_parts = [part.strip() for part in action_text.split(";") if part.strip()]
@@ -501,6 +518,20 @@ class FunctionCallParser:
         self.multi_format_parser = MultiFormatParser(detectors)
         self.tools = tools
 
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a tool call in the format supported by this parser.
+        This delegates to the detector's implementation.
+
+        :param text: The text to check for tool calls
+        :return: True if the text contains a tool call, False otherwise
+        """
+        # Check all detectors in the multi_format_parser
+        for detector in self.multi_format_parser.detectors:
+            if detector.has_tool_call(text):
+                return True
+        return False
+
     def parse_non_stream(self, full_text: str):
         """
         Non-streaming call: one-time parsing
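A minimal usage sketch (not part of the diff) of the new has_tool_call entry point on FunctionCallParser; it assumes "qwen25" is the registered parser key for Qwen25Detector, and the empty tools list is enough here because only the detectors' token checks run:

from sglang.srt.function_call_parser import FunctionCallParser

parser = FunctionCallParser(tools=[], tool_call_parser="qwen25")
print(parser.has_tool_call('<tool_call>{"name": "get_weather"}</tool_call>'))  # True
print(parser.has_tool_call("It is sunny in Paris today."))  # False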
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -30,13 +30,20 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
-from sglang.srt.configs import
+from sglang.srt.configs import (
+    ChatGLMConfig,
+    DbrxConfig,
+    ExaoneConfig,
+    MultiModalityConfig,
+    Qwen2_5_VLConfig,
+)
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
     Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
+    MultiModalityConfig.model_type: MultiModalityConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
@@ -66,6 +73,14 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
+
+    # FIXME: Pour contents of janus-pro's langauge_config to first-level
+    if isinstance(model, str) and model.lower().startswith("deepseek-ai/janus-pro"):
+        assert hasattr(config, "language_config")
+        for key, val in config.language_config.__dict__.items():
+            setattr(config, key, val)
+        setattr(config, "architectures", ["MultiModalityCausalLM"])
+
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model, revision=revision)
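A minimal sketch (not part of the diff) of what the Janus-Pro special case in get_config above does: fields from the checkpoint's language_config are promoted to the top level and architectures is overridden, so downstream loading sees a flat config. The model id is the public Janus-Pro checkpoint this branch targets:

from sglang.srt.hf_transformers_utils import get_config

config = get_config("deepseek-ai/Janus-Pro-7B", trust_remote_code=True)
print(config.architectures)  # ["MultiModalityCausalLM"], set by the override above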
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention