frontveg 0.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- frontveg/__init__.py +11 -0
- frontveg/_tests/__init__.py +0 -0
- frontveg/_tests/test_widget.py +66 -0
- frontveg/_version.py +21 -0
- frontveg/_widget.py +132 -0
- frontveg/napari.yaml +14 -0
- frontveg/utils.py +95 -0
- frontveg-0.1.dev1.dist-info/METADATA +143 -0
- frontveg-0.1.dev1.dist-info/RECORD +44 -0
- frontveg-0.1.dev1.dist-info/WHEEL +5 -0
- frontveg-0.1.dev1.dist-info/entry_points.txt +2 -0
- frontveg-0.1.dev1.dist-info/licenses/LICENSE +28 -0
- frontveg-0.1.dev1.dist-info/top_level.txt +2 -0
- sam2/__init__.py +11 -0
- sam2/automatic_mask_generator.py +454 -0
- sam2/build_sam.py +167 -0
- sam2/configs/sam2/sam2_hiera_b+.yaml +113 -0
- sam2/configs/sam2/sam2_hiera_l.yaml +117 -0
- sam2/configs/sam2/sam2_hiera_s.yaml +116 -0
- sam2/configs/sam2/sam2_hiera_t.yaml +118 -0
- sam2/modeling/__init__.py +5 -0
- sam2/modeling/backbones/__init__.py +5 -0
- sam2/modeling/backbones/hieradet.py +317 -0
- sam2/modeling/backbones/image_encoder.py +134 -0
- sam2/modeling/backbones/utils.py +95 -0
- sam2/modeling/memory_attention.py +169 -0
- sam2/modeling/memory_encoder.py +181 -0
- sam2/modeling/position_encoding.py +221 -0
- sam2/modeling/sam/__init__.py +5 -0
- sam2/modeling/sam/mask_decoder.py +295 -0
- sam2/modeling/sam/prompt_encoder.py +182 -0
- sam2/modeling/sam/transformer.py +360 -0
- sam2/modeling/sam2_base.py +907 -0
- sam2/modeling/sam2_utils.py +323 -0
- sam2/sam2_hiera_b+.yaml +1 -0
- sam2/sam2_hiera_l.yaml +1 -0
- sam2/sam2_hiera_s.yaml +1 -0
- sam2/sam2_hiera_t.yaml +1 -0
- sam2/sam2_image_predictor.py +466 -0
- sam2/sam2_video_predictor.py +1172 -0
- sam2/utils/__init__.py +5 -0
- sam2/utils/amg.py +348 -0
- sam2/utils/misc.py +349 -0
- sam2/utils/transforms.py +118 -0
sam2/modeling/memory_attention.py
@@ -0,0 +1,169 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional

import torch
from torch import nn, Tensor

from sam2.modeling.sam.transformer import RoPEAttention

from sam2.modeling.sam2_utils import get_activation_fn, get_clones


class MemoryAttentionLayer(nn.Module):

    def __init__(
        self,
        activation: str,
        cross_attention: nn.Module,
        d_model: int,
        dim_feedforward: int,
        dropout: float,
        pos_enc_at_attn: bool,
        pos_enc_at_cross_attn_keys: bool,
        pos_enc_at_cross_attn_queries: bool,
        self_attention: nn.Module,
    ):
        super().__init__()
        self.d_model = d_model
        self.dim_feedforward = dim_feedforward
        self.dropout_value = dropout
        self.self_attn = self_attention
        self.cross_attn_image = cross_attention

        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation_str = activation
        self.activation = get_activation_fn(activation)

        # Where to add pos enc
        self.pos_enc_at_attn = pos_enc_at_attn
        self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
        self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

    def _forward_sa(self, tgt, query_pos):
        # Self-Attention
        tgt2 = self.norm1(tgt)
        q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
        tgt2 = self.self_attn(q, k, v=tgt2)
        tgt = tgt + self.dropout1(tgt2)
        return tgt

    def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
        kwds = {}
        if num_k_exclude_rope > 0:
            assert isinstance(self.cross_attn_image, RoPEAttention)
            kwds = {"num_k_exclude_rope": num_k_exclude_rope}

        # Cross-Attention
        tgt2 = self.norm2(tgt)
        tgt2 = self.cross_attn_image(
            q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
            k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
            v=memory,
            **kwds,
        )
        tgt = tgt + self.dropout2(tgt2)
        return tgt

    def forward(
        self,
        tgt,
        memory,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
        num_k_exclude_rope: int = 0,
    ) -> torch.Tensor:

        # Self-Attn, Cross-Attn
        tgt = self._forward_sa(tgt, query_pos)
        tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
        # MLP
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt


class MemoryAttention(nn.Module):
    def __init__(
        self,
        d_model: int,
        pos_enc_at_input: bool,
        layer: nn.Module,
        num_layers: int,
        batch_first: bool = True,  # Do layers expect batch first input?
    ):
        super().__init__()
        self.d_model = d_model
        self.layers = get_clones(layer, num_layers)
        self.num_layers = num_layers
        self.norm = nn.LayerNorm(d_model)
        self.pos_enc_at_input = pos_enc_at_input
        self.batch_first = batch_first

    def forward(
        self,
        curr: torch.Tensor,  # self-attention inputs
        memory: torch.Tensor,  # cross-attention inputs
        curr_pos: Optional[Tensor] = None,  # pos_enc for self-attention inputs
        memory_pos: Optional[Tensor] = None,  # pos_enc for cross-attention inputs
        num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
    ):
        if isinstance(curr, list):
            assert isinstance(curr_pos, list)
            assert len(curr) == len(curr_pos) == 1
            curr, curr_pos = (
                curr[0],
                curr_pos[0],
            )

        assert (
            curr.shape[1] == memory.shape[1]
        ), "Batch size must be the same for curr and memory"

        output = curr
        if self.pos_enc_at_input and curr_pos is not None:
            output = output + 0.1 * curr_pos

        if self.batch_first:
            # Convert to batch first
            output = output.transpose(0, 1)
            curr_pos = curr_pos.transpose(0, 1)
            memory = memory.transpose(0, 1)
            memory_pos = memory_pos.transpose(0, 1)

        for layer in self.layers:
            kwds = {}
            if isinstance(layer.cross_attn_image, RoPEAttention):
                kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}

            output = layer(
                tgt=output,
                memory=memory,
                pos=memory_pos,
                query_pos=curr_pos,
                **kwds,
            )
        normed_output = self.norm(output)

        if self.batch_first:
            # Convert back to seq first
            normed_output = normed_output.transpose(0, 1)
            curr_pos = curr_pos.transpose(0, 1)

        return normed_output
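A minimal usage sketch (not part of the package) showing how MemoryAttentionLayer and MemoryAttention compose. The IdentityAttention stand-in is an assumption used only to keep the snippet self-contained; in the bundled sam2_hiera_*.yaml configs the self- and cross-attention modules are RoPEAttention instances, and the memory features have their own (smaller) channel dimension that the real attention module projects back to d_model. Inputs are sequence-first (L, B, C); with batch_first=True the stack transposes internally.

# Hypothetical sketch; IdentityAttention is a stand-in, not part of sam2.
import torch
from torch import nn

from sam2.modeling.memory_attention import MemoryAttention, MemoryAttentionLayer


class IdentityAttention(nn.Module):
    # Minimal module with the q/k/v calling convention the layer expects.
    def forward(self, q, k, v, **kwargs):
        return v


d_model = 256
layer = MemoryAttentionLayer(
    activation="relu",
    cross_attention=IdentityAttention(),  # real configs: RoPEAttention
    d_model=d_model,
    dim_feedforward=2048,
    dropout=0.1,
    pos_enc_at_attn=False,
    pos_enc_at_cross_attn_keys=True,
    pos_enc_at_cross_attn_queries=False,
    self_attention=IdentityAttention(),  # real configs: RoPEAttention
)
memory_attention = MemoryAttention(
    d_model=d_model, pos_enc_at_input=True, layer=layer, num_layers=4
)

seq_len, batch = 1024, 1
curr = torch.randn(seq_len, batch, d_model)        # self-attention inputs (L, B, C)
curr_pos = torch.randn(seq_len, batch, d_model)    # pos_enc for curr
# With the identity stand-in, memory must share curr's shape for the residual add;
# the real cross-attention handles a memory bank of different length and width.
memory = torch.randn(seq_len, batch, d_model)
memory_pos = torch.randn(seq_len, batch, d_model)

out = memory_attention(curr=curr, memory=memory, curr_pos=curr_pos, memory_pos=memory_pos)
print(out.shape)  # torch.Size([1024, 1, 256])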
sam2/modeling/memory_encoder.py
@@ -0,0 +1,181 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

from sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d


class MaskDownSampler(nn.Module):
    """
    Progressively downsample a mask by total_stride, each time by stride.
    Note that LayerNorm is applied per *token*, like in ViT.

    With each downsample (by a factor stride**2), channel capacity increases by the same factor.
    In the end, we linearly project to embed_dim channels.
    """

    def __init__(
        self,
        embed_dim=256,
        kernel_size=4,
        stride=4,
        padding=0,
        total_stride=16,
        activation=nn.GELU,
    ):
        super().__init__()
        num_layers = int(math.log2(total_stride) // math.log2(stride))
        assert stride**num_layers == total_stride
        self.encoder = nn.Sequential()
        mask_in_chans, mask_out_chans = 1, 1
        for _ in range(num_layers):
            mask_out_chans = mask_in_chans * (stride**2)
            self.encoder.append(
                nn.Conv2d(
                    mask_in_chans,
                    mask_out_chans,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                )
            )
            self.encoder.append(LayerNorm2d(mask_out_chans))
            self.encoder.append(activation())
            mask_in_chans = mask_out_chans

        self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1))

    def forward(self, x):
        return self.encoder(x)


# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
class CXBlock(nn.Module):
    r"""ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    We use (2) as we find it slightly faster in PyTorch

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """

    def __init__(
        self,
        dim,
        kernel_size=7,
        padding=3,
        drop_path=0.0,
        layer_scale_init_value=1e-6,
        use_dwconv=True,
    ):
        super().__init__()
        self.dwconv = nn.Conv2d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=padding,
            groups=dim if use_dwconv else 1,
        )  # depthwise conv
        self.norm = LayerNorm2d(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(
            dim, 4 * dim
        )  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = self.norm(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        x = input + self.drop_path(x)
        return x


class Fuser(nn.Module):
    def __init__(self, layer, num_layers, dim=None, input_projection=False):
        super().__init__()
        self.proj = nn.Identity()
        self.layers = get_clones(layer, num_layers)

        if input_projection:
            assert dim is not None
            self.proj = nn.Conv2d(dim, dim, kernel_size=1)

    def forward(self, x):
        # normally x: (N, C, H, W)
        x = self.proj(x)
        for layer in self.layers:
            x = layer(x)
        return x


class MemoryEncoder(nn.Module):
    def __init__(
        self,
        out_dim,
        mask_downsampler,
        fuser,
        position_encoding,
        in_dim=256,  # in_dim of pix_feats
    ):
        super().__init__()

        self.mask_downsampler = mask_downsampler

        self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
        self.fuser = fuser
        self.position_encoding = position_encoding
        self.out_proj = nn.Identity()
        if out_dim != in_dim:
            self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)

    def forward(
        self,
        pix_feat: torch.Tensor,
        masks: torch.Tensor,
        skip_mask_sigmoid: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        ## Process masks
        # sigmoid, so that less domain shift from gt masks which are bool
        if not skip_mask_sigmoid:
            masks = F.sigmoid(masks)
        masks = self.mask_downsampler(masks)

        ## Fuse pix_feats and downsampled masks
        # in case the visual features are on CPU, cast them to CUDA
        pix_feat = pix_feat.to(masks.device)

        x = self.pix_feat_proj(pix_feat)
        x = x + masks
        x = self.fuser(x)
        x = self.out_proj(x)

        pos = self.position_encoding(x).to(x.dtype)

        return {"vision_features": x, "vision_pos_enc": [pos]}
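For orientation, a usage sketch (not from the package) assembling a MemoryEncoder from the pieces above. The constructor arguments mirror the values in the bundled sam2_hiera_*.yaml configs (an assumption; check the configs shipped with this wheel), and the spatial sizes are illustrative. Note that forward returns a dict despite its Tuple annotation.

import torch

from sam2.modeling.memory_encoder import CXBlock, Fuser, MaskDownSampler, MemoryEncoder
from sam2.modeling.position_encoding import PositionEmbeddingSine

memory_encoder = MemoryEncoder(
    out_dim=64,
    mask_downsampler=MaskDownSampler(kernel_size=3, stride=2, padding=1),  # four stride-2 convs -> /16
    fuser=Fuser(
        CXBlock(dim=256, kernel_size=7, padding=3, layer_scale_init_value=1e-6),
        num_layers=2,
    ),
    position_encoding=PositionEmbeddingSine(num_pos_feats=64),
    in_dim=256,
)

pix_feat = torch.randn(1, 256, 16, 16)  # backbone features at stride 16 (illustrative size)
masks = torch.randn(1, 1, 256, 256)     # mask logits at input resolution, downsampled x16 internally
out = memory_encoder(pix_feat, masks)
print(out["vision_features"].shape)     # torch.Size([1, 64, 16, 16])
print(out["vision_pos_enc"][0].shape)   # torch.Size([1, 64, 16, 16])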
sam2/modeling/position_encoding.py
@@ -0,0 +1,221 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
from typing import Any, Optional, Tuple

import numpy as np

import torch
from torch import nn


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention Is All You Need paper, generalized to work on images.
    """

    def __init__(
        self,
        num_pos_feats,
        temperature: int = 10000,
        normalize: bool = True,
        scale: Optional[float] = None,
    ):
        super().__init__()
        assert num_pos_feats % 2 == 0, "Expecting even model width"
        self.num_pos_feats = num_pos_feats // 2
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

        self.cache = {}

    def _encode_xy(self, x, y):
        # The positions are expected to be normalized
        assert len(x) == len(y) and x.ndim == y.ndim == 1
        x_embed = x * self.scale
        y_embed = y * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, None] / dim_t
        pos_y = y_embed[:, None] / dim_t
        pos_x = torch.stack(
            (pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2
        ).flatten(1)
        pos_y = torch.stack(
            (pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2
        ).flatten(1)
        return pos_x, pos_y

    @torch.no_grad()
    def encode_boxes(self, x, y, w, h):
        pos_x, pos_y = self._encode_xy(x, y)
        pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
        return pos

    encode = encode_boxes  # Backwards compatibility

    @torch.no_grad()
    def encode_points(self, x, y, labels):
        (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
        assert bx == by and nx == ny and bx == bl and nx == nl
        pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
        pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1)
        pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)
        return pos

    @torch.no_grad()
    def forward(self, x: torch.Tensor):
        cache_key = (x.shape[-2], x.shape[-1])
        if cache_key in self.cache:
            return self.cache[cache_key][None].repeat(x.shape[0], 1, 1, 1)
        y_embed = (
            torch.arange(1, x.shape[-2] + 1, dtype=torch.float32, device=x.device)
            .view(1, -1, 1)
            .repeat(x.shape[0], 1, x.shape[-1])
        )
        x_embed = (
            torch.arange(1, x.shape[-1] + 1, dtype=torch.float32, device=x.device)
            .view(1, 1, -1)
            .repeat(x.shape[0], x.shape[-2], 1)
        )

        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        self.cache[cache_key] = pos[0]
        return pos


class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for a grid of the specified size."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        coords = coords_input.clone()
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C


# Rotary Positional Encoding, adapted from:
# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
# 2. https://github.com/naver-ai/rope-vit
# 3. https://github.com/lucidrains/rotary-embedding-torch


def init_t_xy(end_x: int, end_y: int):
    t = torch.arange(end_x * end_y, dtype=torch.float32)
    t_x = (t % end_x).float()
    t_y = torch.div(t, end_x, rounding_mode="floor").float()
    return t_x, t_y


def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
    freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
    freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))

    t_x, t_y = init_t_xy(end_x, end_y)
    freqs_x = torch.outer(t_x, freqs_x)
    freqs_y = torch.outer(t_y, freqs_y)
    freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
    freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
    return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
    shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(*shape)


def apply_rotary_enc(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
    repeat_freqs_k: bool = False,
):
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = (
        torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
        if xk.shape[-2] != 0
        else None
    )
    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
    if xk_ is None:
        # no keys to rotate, due to dropout
        return xq_out.type_as(xq).to(xq.device), xk
    # repeat freqs along seq_len dim to match k seq_len
    if repeat_freqs_k:
        r = xk_.shape[-2] // xq_.shape[-2]
        if freqs_cis.is_cuda:
            freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
        else:
            # torch.repeat on complex numbers may not be supported on non-CUDA devices
            # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
            freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
    return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
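A small sketch (illustrative values, not from the package) exercising the two main pieces of this module: the cached sine embedding over a (B, C, H, W) feature map, and the axial RoPE helpers that the RoPEAttention referenced in memory_attention.py presumably builds on.

import torch

from sam2.modeling.position_encoding import (
    PositionEmbeddingSine,
    apply_rotary_enc,
    compute_axial_cis,
)

# Sine embedding over a feature map; results are cached per (H, W).
pe = PositionEmbeddingSine(num_pos_feats=256)
feats = torch.randn(2, 256, 32, 32)
pos = pe(feats)
print(pos.shape)  # torch.Size([2, 256, 32, 32])

# Axial 2D RoPE: complex frequencies for an 8x8 token grid, per-head dim 64.
head_dim, H, W = 64, 8, 8
freqs_cis = compute_axial_cis(dim=head_dim, end_x=W, end_y=H)  # (H*W, head_dim // 2), complex

q = torch.randn(1, 2, H * W, head_dim)  # (batch, heads, seq, head_dim)
k = torch.randn(1, 2, H * W, head_dim)
q_rot, k_rot = apply_rotary_enc(q, k, freqs_cis=freqs_cis)
print(q_rot.shape, k_rot.shape)  # both torch.Size([1, 2, 64, 64])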