sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/constrained/outlines_backend.py +9 -1
- sglang/srt/custom_op.py +40 -0
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/layers/activation.py +10 -5
- sglang/srt/layers/attention/flashinfer_backend.py +284 -39
- sglang/srt/layers/attention/triton_backend.py +71 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
- sglang/srt/layers/attention/vision.py +243 -40
- sglang/srt/layers/layernorm.py +1 -5
- sglang/srt/layers/moe/ep_moe/layer.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
- sglang/srt/layers/moe/topk.py +4 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +7 -0
- sglang/srt/layers/quantization/fp8_kernel.py +140 -2
- sglang/srt/layers/rotary_embedding.py +29 -15
- sglang/srt/layers/sampler.py +9 -6
- sglang/srt/lora/backend/__init__.py +8 -0
- sglang/srt/lora/backend/base_backend.py +95 -0
- sglang/srt/lora/backend/flashinfer_backend.py +91 -0
- sglang/srt/lora/backend/triton_backend.py +61 -0
- sglang/srt/lora/lora.py +127 -112
- sglang/srt/lora/lora_manager.py +50 -18
- sglang/srt/lora/triton_ops/__init__.py +5 -0
- sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
- sglang/srt/managers/image_processor.py +77 -38
- sglang/srt/managers/scheduler.py +17 -3
- sglang/srt/mem_cache/base_prefix_cache.py +4 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +30 -1
- sglang/srt/model_executor/cuda_graph_runner.py +77 -80
- sglang/srt/model_executor/forward_batch_info.py +58 -59
- sglang/srt/model_executor/model_runner.py +2 -2
- sglang/srt/models/minicpmv.py +129 -76
- sglang/srt/models/mllama.py +16 -56
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_vl.py +19 -9
- sglang/srt/server_args.py +19 -2
- sglang/srt/speculative/build_eagle_tree.py +4 -2
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
- sglang/srt/speculative/eagle_utils.py +361 -372
- sglang/srt/speculative/eagle_worker.py +177 -45
- sglang/srt/utils.py +7 -2
- sglang/test/runners.py +2 -0
- sglang/utils.py +42 -0
- sglang/version.py +1 -1
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
- sglang/srt/layers/custom_op_util.py +0 -25
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/minicpmv.py
CHANGED
@@ -1,6 +1,6 @@
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The
+# Copyright 2023 The SGLang team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -20,7 +20,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
-from functools import
+from functools import partial
 from typing import (
     Any,
     Callable,
@@ -33,16 +33,13 @@ from typing import (
     Union,
 )

+import numpy as np
 import torch
 import torch.types
 from PIL import Image
 from torch import nn
 from torch.nn.init import trunc_normal_
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.resampler import get_2d_sincos_pos_embed
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
-from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata

 from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
@@ -63,6 +60,88 @@ from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 RawImageType = Union[Image.Image, torch.Tensor]


+# sin/cos positional embedding helpers are adapted from:
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_1d_sincos_pos_embed_from_grid(
+    embed_dim: int, pos: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,) / (H, W)
+    out: (M, D) / (H, W, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    if version == (2, 0):
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    else:
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed_from_grid(
+    embed_dim: int, grid: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[0], version
+    )  # (H*W, D/2) or (H, W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[1], version
+    )  # (H*W, D/2) or (H, W, D/2)
+
+    if version == (2, 0):
+        emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    else:
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed(
+    embed_dim: int,
+    grid_size: Union[int, Tuple[int, int]],
+    cls_token: bool = False,
+    version: Tuple[int, int] = (2, 0),
+) -> torch.Tensor:
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or
+        [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
+
+    if version == (2, 0):
+        grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+        if cls_token:
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    else:
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+    return pos_embed
+
+
 class Idefics2VisionMLP(nn.Module):

     def __init__(
@@ -116,6 +195,10 @@ class Idefics2EncoderLayer(nn.Module):
             projection_size=config.intermediate_size,
             use_qkv_parallel=True,
             quant_config=quant_config,
+            dropout=config.attention_dropout,
+            use_context_forward=False,
+            use_full_precision_softmax=True,
+            flatten_batch=False,
             prefix=f"{prefix}.self_attn",
         )
         self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@@ -126,7 +209,6 @@ class Idefics2EncoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         """
         Args:
@@ -136,11 +218,8 @@ class Idefics2EncoderLayer(nn.Module):
         """
         residual = hidden_states
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(
-
-            cu_seqlens=cu_seqlens,
-            # , forward_batch=forward_batch
-        )
+        hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
+
         hidden_states = residual + hidden_states
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
@@ -181,7 +260,6 @@ class Idefics2Encoder(nn.Module):
         self,
         inputs_embeds: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         r"""
         Args:
@@ -195,7 +273,8 @@ class Idefics2Encoder(nn.Module):
         hidden_states = inputs_embeds
         for encoder_layer in self.layers:
             layer_outputs = encoder_layer(
-                hidden_states,
+                hidden_states,
+                cu_seqlens=cu_seqlens,
             )
             hidden_states = layer_outputs
         return hidden_states
@@ -232,19 +311,14 @@ class Idefics2VisionEmbeddings(nn.Module):
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

-    def
+    def get_position_ids(
         self,
         pixel_values: torch.FloatTensor,
         patch_attention_mask: torch.BoolTensor,
         tgt_sizes: Optional[torch.IntTensor] = None,
-    )
+    ):
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
-
-        pixel_values = pixel_values.to(
-            device=self.patch_embedding.weight.device, dtype=target_dtype
-        )
-        patch_embeds = self.patch_embedding(pixel_values)
-        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
         max_nb_patches_h, max_nb_patches_w = (
             max_im_h // self.patch_size,
             max_im_w // self.patch_size,
@@ -277,6 +351,24 @@ class Idefics2VisionEmbeddings(nn.Module):
             ).flatten()
             position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
         position_ids = position_ids.to(self.position_embedding.weight.device)
+        return position_ids
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        pixel_values = pixel_values.to(
+            device=self.patch_embedding.weight.device, dtype=target_dtype
+        )
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        position_ids = self.get_position_ids(
+            pixel_values, patch_attention_mask, tgt_sizes
+        )
+
         embeddings = embeddings + self.position_embedding(position_ids)
         return embeddings

@@ -287,7 +379,6 @@ class Idefics2VisionTransformer(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
     ) -> None:
         super().__init__()

@@ -302,8 +393,6 @@
     def compute_cu_seqlens(self, tgt_sizes: torch.Tensor) -> torch.Tensor:
         patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # shape: (batch_size,)
-
-        # 做 prefix sum 来得到 cu_seqlens,注意在最前面插一个 0 作为 offset
         cu_seqlens = torch.cat(
             [
                 torch.tensor([0], device=patch_len.device, dtype=torch.int32),
@@ -316,19 +405,18 @@
     def forward(
         self,
         pixel_values,
-        forward_batch: ForwardBatch,
         patch_attention_mask: Optional[torch.BoolTensor] = None,
         tgt_sizes: Optional[torch.IntTensor] = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
-            # forward_batch=forward_batch,
             tgt_sizes=tgt_sizes,
         )
         cu_seqlens = self.compute_cu_seqlens(tgt_sizes)
         encoder_outputs = self.encoder(
-            hidden_states,
+            hidden_states,
+            cu_seqlens=cu_seqlens,
         )
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
@@ -573,14 +661,12 @@ class MiniCPMVBaseModel(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
     ):
-        # multimodal_config = config.model_config.multimodal_config
         super().__init__()
         # All MiniCPM-V models disable `tie_word_embeddings` but
         # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
-        # check `tie_word_embeddings` until
+        # check `tie_word_embeddings` until SGLang integrate MiniCPM-V model
         # and config class
         self.config = config
-        # self.multimodal_config = multimodal_config

         self.version = get_version_by_config(self.config)
         self.llm = self.init_llm(config=config, quant_config=quant_config)
@@ -598,13 +684,6 @@ class MiniCPMVBaseModel(nn.Module):

         self.logits_processor = LogitsProcessor(config)

-    @cached_property
-    def sampler(self):
-        if hasattr(self.llm, "sampler"):
-            return self.llm.sampler
-
-        return get_sampler()
-
     def _get_image_bounds(
         self,
         input_ids: torch.Tensor,
@@ -666,7 +745,6 @@ class MiniCPMVBaseModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         image_inputs: Optional[MiniCPMVImageInputs],
-        forward_batch: ForwardBatch,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)

@@ -680,10 +758,7 @@ class MiniCPMVBaseModel(nn.Module):
                 .to(vlm_embedding.device)
             )
         else:
-            vision_hidden_states = self.get_vision_hidden_states(
-                forward_batch, image_inputs
-            )
-
+            vision_hidden_states = self.get_vision_hidden_states(image_inputs)
             # See NOTE in _parse_and_validate_inputs
             image_bounds = image_inputs["image_bounds"]
             if len(image_bounds) > 0:
@@ -693,6 +768,7 @@ class MiniCPMVBaseModel(nn.Module):
                         for start, end in image_bounds.tolist()
                     ]
                 ).to(vlm_embedding.device)
+
                 vlm_embedding.scatter_(
                     0,
                     image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
@@ -839,7 +915,7 @@ class MiniCPMVBaseModel(nn.Module):
         # There values are useless because their embeddings will be replaced by vision embeddings anyway.
         input_ids.clamp_(min=0, max=self.config.vocab_size - 1)

-        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs
+        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)

         # always pass the input via `inputs_embeds`
         # to make sure the computation graph is consistent
@@ -857,29 +933,6 @@ class MiniCPMVBaseModel(nn.Module):
             input_ids, hidden_states, self.llm.lm_head, forward_batch
         )

-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        return self.llm.compute_logits(hidden_states, sampling_metadata)
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
-    def get_mm_mapping(self) -> MultiModelKeys:
-        """
-        Get the module prefix in multimodal models
-        """
-        return MultiModelKeys.from_string_field(
-            language_model="llm", connector="resampler", tower_model="vpm"
-        )
-
     def init_llm(
         self,
         config: Qwen2Config,
@@ -910,9 +963,7 @@ class MiniCPMVBaseModel(nn.Module):
     ) -> torch.Tensor:
         raise NotImplementedError

-    def get_vision_hidden_states(
-        self, forward_batch: ForwardBatch, data: MiniCPMVImageInputs
-    ) -> torch.Tensor:
+    def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor:
         raise NotImplementedError


@@ -1019,7 +1070,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel):

     def get_vision_hidden_states(
         self,
-        forward_batch: ForwardBatch,
         data: MiniCPMVImageInputs,
     ) -> torch.Tensor:
         pixel_values = data["data"]
@@ -1042,15 +1092,18 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
             patch_attn_mask = torch.zeros(
                 (B, 1, max_patches), dtype=torch.bool, device=device
             )
-
-
+
+            tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+            mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+            patch_attn_mask[:, 0, :] = torch.arange(
+                patch_attn_mask.size(2), device=patch_attn_mask.device
+            ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
             vision_embedding = self.vpm(
                 all_pixel_values.type(dtype),
-                forward_batch=forward_batch,
                 patch_attention_mask=patch_attn_mask,
                 tgt_sizes=tgt_sizes,
             )
-
             return self.resampler(vision_embedding, tgt_sizes)

     def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
@@ -1138,7 +1191,7 @@ class MiniCPMV:
     """
     Different versions of MiniCPMV use different visual encoders and LLMs,
     which is not conducive to the current integration logic of LoRA and
-    bitsandbytes in
+    bitsandbytes in SGLang. Therefore, it is necessary to separate them.
     """

     # Ensure that the LoRA support check passes when the class is not
sglang/srt/models/mllama.py
CHANGED
@@ -17,6 +17,7 @@ from transformers.models.mllama.modeling_mllama import (
 import sglang.srt.distributed.parallel_state as ps
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -145,61 +146,6 @@ class MllamaPrecomputedPositionEmbedding(nn.Module):
         return hidden_state


-class MllamaVisionSdpaAttention(nn.Module):
-    def __init__(self, config: config_mllama.MllamaVisionConfig):
-        super().__init__()
-
-        model_parallel_size = get_tensor_model_parallel_world_size()
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.attention_heads
-        self.head_dim = config.hidden_size // config.attention_heads
-        self.num_local_heads = self.num_heads // model_parallel_size
-        self.q_size = self.num_local_heads * self.head_dim
-        self.kv_size = self.num_local_heads * self.head_dim
-
-        self.qkv_proj = QKVParallelLinear(
-            self.embed_dim,
-            self.head_dim,
-            self.num_heads,
-            bias=False,
-        )
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.head_dim,
-            self.embed_dim,
-            bias=False,
-            input_is_parallel=True,
-        )
-
-    def forward(
-        self,
-        hidden_state: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_state)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q = q.view(
-            q.shape[0], q.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-        k = k.view(
-            k.shape[0], k.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-        v = v.view(
-            v.shape[0], v.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-
-        # TODO: remove padding in image encoder
-        attn_output = F.scaled_dot_product_attention(
-            q, k, v, attn_mask=attention_mask, dropout_p=0.0
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(
-            attn_output.shape[0], attn_output.shape[1], -1
-        )
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
 class MllamaVisionMLP(nn.Module):
     def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
@@ -237,7 +183,17 @@ class MllamaVisionEncoderLayer(nn.Module):
         self.is_gated = is_gated
         self.intermediate_size = config.intermediate_size

-        self.self_attn =
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=None,
+            dropout=0.0,
+            use_context_forward=False,
+            use_full_precision_softmax=False,
+            flatten_batch=False,
+        )
         self.mlp = MllamaVisionMLP(config)

         self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps)
@@ -992,6 +948,10 @@ class MllamaForConditionalGeneration(nn.Module):
                     weight_loader(param, loaded_weight, shard_id)
                     break
             else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace("self_attn.o_proj", "self_attn.proj")
+
                 param = params_dict.pop(name)
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
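
The last hunk above remaps checkpoint names because the shared VisionAttention module names its output projection `proj` rather than `o_proj`. A minimal sketch of that string handling, using a made-up HuggingFace-style weight name (only the renaming logic comes from the diff):

    # hypothetical checkpoint key, for illustration only
    name = "vision_model.transformer.layers.0.self_attn.o_proj.weight"
    if "vision_model" in name:
        # adapt to VisionAttention
        name = name.replace("self_attn.o_proj", "self_attn.proj")
    print(name)  # vision_model.transformer.layers.0.self_attn.proj.weight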
sglang/srt/models/qwen2.py
CHANGED
@@ -249,7 +249,10 @@ class Qwen2Model(nn.Module):
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-
+        if hasattr(self.config, "scale_emb"):
+            return self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            return self.embed_tokens(input_ids)

     def forward(
         self,
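
The `scale_emb` branch added above appears to mirror MiniCPM-style configs, which define a `scale_emb` field that multiplies token embeddings by a constant. A standalone sketch of the same logic with a stand-in config (the value 12.0 and the tensor sizes are illustrative, not taken from any released checkpoint):

    import torch
    import torch.nn as nn

    class DummyConfig:
        scale_emb = 12.0  # hypothetical scaling factor

    config = DummyConfig()
    embed_tokens = nn.Embedding(32, 8)
    input_ids = torch.tensor([1, 2, 3])

    if hasattr(config, "scale_emb"):
        out = embed_tokens(input_ids) * config.scale_emb
    else:
        out = embed_tokens(input_ids)
    print(out.shape)  # torch.Size([3, 8])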
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -30,13 +30,11 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange
-from vllm.model_executor.layers.activation import QuickGELU
+from einops import rearrange

 from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
-from sglang.srt.distributed import parallel_state
-from sglang.srt.distributed import utils as dist_utils
 from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -118,6 +116,7 @@ class Qwen2VisionBlock(nn.Module):
         mlp_ratio: float,
         act_layer: Type[nn.Module] = QuickGELU,
         norm_layer: Type[nn.Module] = None,
+        attn_implementation: Optional[str] = "sdpa",
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -126,12 +125,24 @@ class Qwen2VisionBlock(nn.Module):
         self.norm1 = norm_layer(dim)
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
+        if attn_implementation == "sdpa":
+            use_context_forward = False
+            use_full_precision_softmax = False
+        elif attn_implementation == "flash_attention_2":
+            use_full_precision_softmax = False
+            use_context_forward = True
+        elif attn_implementation == "eager":
+            use_full_precision_softmax = True
+            use_context_forward = False

         self.attn = VisionAttention(
             embed_dim=dim,
             num_heads=num_heads,
             projection_size=dim,
             use_qkv_parallel=False,
+            use_context_forward=use_context_forward,
+            use_full_precision_softmax=use_full_precision_softmax,
+            flatten_batch=True,
             quant_config=quant_config,
         )
         self.mlp = Qwen2VisionMLP(
@@ -286,7 +297,6 @@ class Qwen2VisionTransformer(nn.Module):
         norm_layer = partial(nn.LayerNorm, eps=norm_eps)
         head_dim = embed_dim // num_heads
         self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
-
         self.blocks = nn.ModuleList(
             [
                 Qwen2VisionBlock(
@@ -294,6 +304,7 @@ class Qwen2VisionTransformer(nn.Module):
                     num_heads=num_heads,
                     mlp_ratio=mlp_ratio,
                     norm_layer=norm_layer,
+                    attn_implementation="sdpa",
                     quant_config=quant_config,
                 )
                 for _ in range(depth)
@@ -482,10 +493,6 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 opensource models), the shape will be `(3, seq_len)`,
                 otherwise it will be `(seq_len,).
             (Use input_metadata.mrope_positions to replace it)
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
         """
         if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
             positions = forward_batch.mrope_positions
@@ -540,15 +547,18 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                     num_image_tokens = self.calculate_num_image_tokens(
                         image_grid_thws[idx]
                     )
+
                     left_idx = start_idx + (image_offset - prefix_len)
                     right_idx = (
                         start_idx + (image_offset - prefix_len) + num_image_tokens
                     )
+
                     inputs_embeds[left_idx:right_idx] = image_embeds[
                         image_embeds_offset : image_embeds_offset + num_image_tokens
                     ]
                     image_embeds_offset += num_image_tokens

+            input_ids = None
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
sglang/srt/server_args.py
CHANGED
@@ -113,6 +113,7 @@ class ServerArgs:
     # LoRA
     lora_paths: Optional[List[str]] = None
     max_loras_per_batch: int = 8
+    lora_backend: str = "triton"

     # Kernel backend
     attention_backend: Optional[str] = None
@@ -163,6 +164,7 @@ class ServerArgs:
     # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
+    enable_hierarchical_cache: bool = False

     def __post_init__(self):
         # Set missing default values
@@ -272,6 +274,10 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -648,13 +654,19 @@ class ServerArgs:
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
+            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}.",
         )
         parser.add_argument(
             "--max-loras-per-batch",
             type=int,
             default=8,
-            help="Maximum number of adapters for a running batch, include base-only request",
+            help="Maximum number of adapters for a running batch, include base-only request.",
+        )
+        parser.add_argument(
+            "--lora-backend",
+            type=str,
+            default="triton",
+            help="Choose the kernel backend for multi-LoRA serving.",
         )

         # Kernel backend
@@ -892,6 +904,11 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
+        parser.add_argument(
+            "--enable-hierarchical-cache",
+            action="store_true",
+            help="Enable hierarchical cache",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
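
The server_args.py changes add a `lora_backend` option (Triton is the default; the file list above also ships a flashinfer backend under sglang/srt/lora/backend/) and an `enable_hierarchical_cache` flag. A hedged sketch of setting the new fields directly on the ServerArgs dataclass; the model and adapter paths are placeholders, and constructing ServerArgs runs its `__post_init__` defaulting, so this is meant to run inside a working sglang install:

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="your-model-path",            # placeholder
        lora_paths=["/path/to/adapter0"],        # placeholder adapter path
        max_loras_per_batch=8,
        lora_backend="triton",                   # new field in 0.4.2.post2
        enable_hierarchical_cache=False,         # new flag, off by default
    )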
sglang/srt/speculative/build_eagle_tree.py
CHANGED
@@ -79,11 +79,13 @@ __global__ void build_tree(Tensor<long, 2> parent_list, Tensor<long, 2> selected
 )


-def build_tree_kernel(
+def build_tree_kernel(
+    parent_list, top_score_index, seq_lens, seq_lens_sum, topk, depth, draft_token
+):
     bs = seq_lens.numel()
     device = parent_list.device
     tree_mask = torch.full(
-        (
+        (seq_lens_sum * draft_token + draft_token * draft_token * bs,),
         True,
         device=device,
     )
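
As a worked example of the flattened tree_mask size computed in build_tree_kernel above, with illustrative EAGLE settings (two requests, four draft tokens each; the values are made up for the arithmetic only):

    import torch

    seq_lens = torch.tensor([5, 7])      # per-request sequence lengths
    seq_lens_sum = int(seq_lens.sum())   # 12
    draft_token = 4                      # draft tokens per request
    bs = seq_lens.numel()                # 2

    numel = seq_lens_sum * draft_token + draft_token * draft_token * bs
    tree_mask = torch.full((numel,), True)
    print(tree_mask.shape)  # torch.Size([80])  = 12*4 + 4*4*2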