ultralytics 8.3.89__py3-none-any.whl → 8.3.91__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/conftest.py +2 -2
- tests/test_cli.py +13 -11
- tests/test_cuda.py +10 -1
- tests/test_exports.py +2 -2
- tests/test_integrations.py +1 -5
- tests/test_python.py +16 -16
- tests/test_solutions.py +9 -9
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +3 -1
- ultralytics/cfg/models/11/yolo11-cls.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-obb.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-pose.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-seg.yaml +5 -5
- ultralytics/cfg/models/11/yolo11.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-obb.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-world.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8.yaml +5 -5
- ultralytics/cfg/models/v9/yolov9c-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9c.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9m.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9t.yaml +1 -1
- ultralytics/data/annotator.py +9 -14
- ultralytics/data/base.py +118 -30
- ultralytics/data/build.py +63 -24
- ultralytics/data/converter.py +5 -5
- ultralytics/data/dataset.py +207 -53
- ultralytics/data/loaders.py +1 -0
- ultralytics/data/split_dota.py +39 -12
- ultralytics/data/utils.py +15 -19
- ultralytics/engine/exporter.py +24 -23
- ultralytics/engine/model.py +67 -88
- ultralytics/engine/predictor.py +106 -21
- ultralytics/engine/trainer.py +32 -23
- ultralytics/engine/tuner.py +21 -18
- ultralytics/engine/validator.py +75 -41
- ultralytics/hub/__init__.py +12 -13
- ultralytics/hub/auth.py +9 -12
- ultralytics/hub/session.py +76 -21
- ultralytics/hub/utils.py +19 -17
- ultralytics/models/fastsam/model.py +20 -11
- ultralytics/models/fastsam/predict.py +36 -16
- ultralytics/models/fastsam/utils.py +5 -5
- ultralytics/models/fastsam/val.py +6 -6
- ultralytics/models/nas/model.py +22 -11
- ultralytics/models/nas/predict.py +9 -4
- ultralytics/models/nas/val.py +5 -5
- ultralytics/models/rtdetr/model.py +20 -11
- ultralytics/models/rtdetr/predict.py +18 -15
- ultralytics/models/rtdetr/train.py +20 -16
- ultralytics/models/rtdetr/val.py +42 -6
- ultralytics/models/sam/__init__.py +1 -1
- ultralytics/models/sam/amg.py +50 -4
- ultralytics/models/sam/model.py +8 -14
- ultralytics/models/sam/modules/decoders.py +18 -21
- ultralytics/models/sam/modules/encoders.py +25 -46
- ultralytics/models/sam/modules/memory_attention.py +19 -15
- ultralytics/models/sam/modules/sam.py +18 -25
- ultralytics/models/sam/modules/tiny_encoder.py +19 -29
- ultralytics/models/sam/modules/transformer.py +35 -57
- ultralytics/models/sam/modules/utils.py +15 -15
- ultralytics/models/sam/predict.py +0 -3
- ultralytics/models/utils/loss.py +87 -36
- ultralytics/models/utils/ops.py +26 -31
- ultralytics/models/yolo/classify/predict.py +24 -3
- ultralytics/models/yolo/classify/train.py +77 -10
- ultralytics/models/yolo/classify/val.py +40 -15
- ultralytics/models/yolo/detect/predict.py +23 -10
- ultralytics/models/yolo/detect/train.py +85 -15
- ultralytics/models/yolo/detect/val.py +145 -21
- ultralytics/models/yolo/model.py +1 -2
- ultralytics/models/yolo/obb/predict.py +12 -4
- ultralytics/models/yolo/obb/train.py +7 -0
- ultralytics/models/yolo/obb/val.py +25 -7
- ultralytics/models/yolo/pose/predict.py +22 -6
- ultralytics/models/yolo/pose/train.py +17 -1
- ultralytics/models/yolo/pose/val.py +46 -21
- ultralytics/models/yolo/segment/predict.py +22 -8
- ultralytics/models/yolo/segment/train.py +6 -0
- ultralytics/models/yolo/segment/val.py +100 -14
- ultralytics/models/yolo/world/train.py +38 -8
- ultralytics/models/yolo/world/train_world.py +39 -10
- ultralytics/nn/autobackend.py +28 -14
- ultralytics/nn/modules/__init__.py +3 -0
- ultralytics/nn/modules/activation.py +12 -3
- ultralytics/nn/modules/block.py +587 -84
- ultralytics/nn/modules/conv.py +418 -54
- ultralytics/nn/modules/head.py +3 -4
- ultralytics/nn/modules/transformer.py +320 -34
- ultralytics/nn/modules/utils.py +17 -3
- ultralytics/nn/tasks.py +221 -69
- ultralytics/solutions/ai_gym.py +2 -2
- ultralytics/solutions/analytics.py +4 -4
- ultralytics/solutions/heatmap.py +4 -4
- ultralytics/solutions/instance_segmentation.py +10 -4
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +2 -2
- ultralytics/solutions/parking_management.py +9 -9
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/region_counter.py +2 -2
- ultralytics/solutions/security_alarm.py +7 -7
- ultralytics/solutions/solutions.py +7 -4
- ultralytics/solutions/speed_estimation.py +2 -2
- ultralytics/solutions/streamlit_inference.py +6 -6
- ultralytics/solutions/trackzone.py +9 -2
- ultralytics/solutions/vision_eye.py +4 -4
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/bot_sort.py +23 -22
- ultralytics/trackers/byte_tracker.py +4 -4
- ultralytics/trackers/track.py +2 -1
- ultralytics/trackers/utils/gmc.py +26 -27
- ultralytics/trackers/utils/kalman_filter.py +31 -29
- ultralytics/trackers/utils/matching.py +7 -7
- ultralytics/utils/__init__.py +32 -27
- ultralytics/utils/autobatch.py +5 -5
- ultralytics/utils/benchmarks.py +111 -18
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +11 -11
- ultralytics/utils/callbacks/comet.py +42 -24
- ultralytics/utils/callbacks/dvc.py +11 -10
- ultralytics/utils/callbacks/hub.py +8 -8
- ultralytics/utils/callbacks/mlflow.py +1 -1
- ultralytics/utils/callbacks/neptune.py +12 -10
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +6 -6
- ultralytics/utils/callbacks/wb.py +16 -16
- ultralytics/utils/checks.py +116 -35
- ultralytics/utils/dist.py +15 -2
- ultralytics/utils/downloads.py +13 -9
- ultralytics/utils/files.py +12 -13
- ultralytics/utils/instance.py +112 -45
- ultralytics/utils/loss.py +28 -33
- ultralytics/utils/metrics.py +246 -181
- ultralytics/utils/ops.py +61 -53
- ultralytics/utils/patches.py +8 -6
- ultralytics/utils/plotting.py +65 -45
- ultralytics/utils/tal.py +88 -57
- ultralytics/utils/torch_utils.py +181 -33
- ultralytics/utils/triton.py +13 -3
- ultralytics/utils/tuner.py +8 -16
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/METADATA +1 -1
- ultralytics-8.3.91.dist-info/RECORD +250 -0
- ultralytics-8.3.89.dist-info/RECORD +0 -250
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/LICENSE +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/head.py CHANGED
@@ -32,7 +32,7 @@ class Detect(nn.Module):
     legacy = False  # backward compatibility for v3/v5/v8/v9 models
 
     def __init__(self, nc=80, ch=()):
-        """
+        """Initialize the YOLO detection layer with specified number of classes and channels."""
         super().__init__()
         self.nc = nc  # number of classes
         self.nl = len(ch)  # number of detection layers
@@ -273,7 +273,7 @@ class Pose(Detect):
         else:
             y = kpts.clone()
             if ndim == 3:
-                y[:, 2::
+                y[:, 2::ndim] = y[:, 2::ndim].sigmoid()  # sigmoid (WARNING: inplace .sigmoid_() Apple MPS bug)
             y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
             y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
             return y
@@ -400,7 +400,7 @@ class RTDETRDecoder(nn.Module):
             nh (int): Number of heads in multi-head attention. Default is 8.
             ndl (int): Number of decoder layers. Default is 6.
             d_ffn (int): Dimension of the feed-forward networks. Default is 1024.
-            dropout (float): Dropout rate. Default is 0.
+            dropout (float): Dropout rate. Default is 0.0.
             act (nn.Module): Activation function. Default is nn.ReLU.
             eval_idx (int): Evaluation index. Default is -1.
             nd (int): Number of denoising. Default is 100.
@@ -563,7 +563,6 @@ class RTDETRDecoder(nn.Module):
 
         return embeddings, refer_bbox, enc_bboxes, enc_scores
 
-    # TODO
     def _reset_parameters(self):
        """Initializes or resets the parameters of the model's various components with predefined weights and biases."""
        # Class and bbox head init
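The Pose hunk above only touches the keypoint-visibility sigmoid line, but the decode arithmetic around it is easiest to see on a single keypoint. A minimal numeric sketch (the anchor cell and stride values below are assumed for illustration, not taken from the diff):

# Toy decode of one keypoint (ndim == 3): x/y follow (raw * 2 + anchor - 0.5) * stride,
# while the visibility logit goes through sigmoid, matching the lines in the hunk above.
import torch

raw = torch.tensor([0.25, -0.125, 0.0])       # (x, y, visibility) head outputs for one keypoint
anchor_x, anchor_y, stride = 10.5, 7.5, 16.0  # assumed grid-cell centre and stride
x = (raw[0] * 2.0 + (anchor_x - 0.5)) * stride
y = (raw[1] * 2.0 + (anchor_y - 0.5)) * stride
v = raw[2].sigmoid()
print(float(x), float(y), float(v))  # 168.0 108.0 0.5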
ultralytics/nn/modules/transformer.py CHANGED
@@ -26,10 +26,34 @@ __all__ = (
 
 
 class TransformerEncoderLayer(nn.Module):
-    """
+    """
+    Defines a single layer of the transformer encoder.
+
+    Attributes:
+        ma (nn.MultiheadAttention): Multi-head attention module.
+        fc1 (nn.Linear): First linear layer in the feedforward network.
+        fc2 (nn.Linear): Second linear layer in the feedforward network.
+        norm1 (nn.LayerNorm): Layer normalization after attention.
+        norm2 (nn.LayerNorm): Layer normalization after feedforward network.
+        dropout (nn.Dropout): Dropout layer for the feedforward network.
+        dropout1 (nn.Dropout): Dropout layer after attention.
+        dropout2 (nn.Dropout): Dropout layer after feedforward network.
+        act (nn.Module): Activation function.
+        normalize_before (bool): Whether to apply normalization before attention and feedforward.
+    """
 
     def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
-        """
+        """
+        Initialize the TransformerEncoderLayer with specified parameters.
+
+        Args:
+            c1 (int): Input dimension.
+            cm (int): Hidden dimension in the feedforward network.
+            num_heads (int): Number of attention heads.
+            dropout (float): Dropout probability.
+            act (nn.Module): Activation function.
+            normalize_before (bool): Whether to apply normalization before attention and feedforward.
+        """
         super().__init__()
         from ...utils.torch_utils import TORCH_1_9
 
@@ -57,7 +81,18 @@ class TransformerEncoderLayer(nn.Module):
         return tensor if pos is None else tensor + pos
 
     def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
+        """
+        Perform forward pass with post-normalization.
+
+        Args:
+            src (torch.Tensor): Input tensor.
+            src_mask (torch.Tensor, optional): Mask for the src sequence.
+            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
+            pos (torch.Tensor, optional): Positional encoding.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention and feedforward.
+        """
         q = k = self.with_pos_embed(src, pos)
         src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
         src = src + self.dropout1(src2)
@@ -67,7 +102,18 @@ class TransformerEncoderLayer(nn.Module):
         return self.norm2(src)
 
     def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
+        """
+        Perform forward pass with pre-normalization.
+
+        Args:
+            src (torch.Tensor): Input tensor.
+            src_mask (torch.Tensor, optional): Mask for the src sequence.
+            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
+            pos (torch.Tensor, optional): Positional encoding.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention and feedforward.
+        """
         src2 = self.norm1(src)
         q = k = self.with_pos_embed(src2, pos)
         src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
@@ -77,21 +123,54 @@ class TransformerEncoderLayer(nn.Module):
         return src + self.dropout2(src2)
 
     def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
+        """
+        Forward propagates the input through the encoder module.
+
+        Args:
+            src (torch.Tensor): Input tensor.
+            src_mask (torch.Tensor, optional): Mask for the src sequence.
+            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
+            pos (torch.Tensor, optional): Positional encoding.
+
+        Returns:
+            (torch.Tensor): Output tensor after transformer encoder layer.
+        """
         if self.normalize_before:
             return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
         return self.forward_post(src, src_mask, src_key_padding_mask, pos)
 
 
 class AIFI(TransformerEncoderLayer):
-    """
+    """
+    Defines the AIFI transformer layer.
+
+    This class extends TransformerEncoderLayer to work with 2D data by adding positional embeddings.
+    """
 
     def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
-        """
+        """
+        Initialize the AIFI instance with specified parameters.
+
+        Args:
+            c1 (int): Input dimension.
+            cm (int): Hidden dimension in the feedforward network.
+            num_heads (int): Number of attention heads.
+            dropout (float): Dropout probability.
+            act (nn.Module): Activation function.
+            normalize_before (bool): Whether to apply normalization before attention and feedforward.
+        """
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
     def forward(self, x):
-        """
+        """
+        Forward pass for the AIFI transformer layer.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape [B, C, H, W].
+
+        Returns:
+            (torch.Tensor): Output tensor with shape [B, C, H, W].
+        """
         c, h, w = x.shape[1:]
         pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
         # Flatten [B, C, H, W] to [B, HxW, C]
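For orientation on the two classes documented above, a minimal usage sketch (not part of the diff; batch size, sequence length, and channel counts are assumptions):

# TransformerEncoderLayer consumes token sequences; AIFI wraps it for 2D feature maps by
# building a sin-cos position embedding and flattening [B, C, H, W] to [B, HxW, C].
import torch
from ultralytics.nn.modules.transformer import AIFI, TransformerEncoderLayer

layer = TransformerEncoderLayer(c1=256, cm=1024, num_heads=8)  # post-norm by default
tokens = torch.randn(2, 100, 256)                              # [B, sequence, C]
print(layer(tokens).shape)                                     # torch.Size([2, 100, 256])

aifi = AIFI(c1=256, cm=1024, num_heads=8)
feats = torch.randn(2, 256, 20, 20)                            # [B, C, H, W]
print(aifi(feats).shape)                                       # torch.Size([2, 256, 20, 20])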
@@ -100,7 +179,18 @@ class AIFI(TransformerEncoderLayer):
 
     @staticmethod
     def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
-        """
+        """
+        Build 2D sine-cosine position embedding.
+
+        Args:
+            w (int): Width of the feature map.
+            h (int): Height of the feature map.
+            embed_dim (int): Embedding dimension.
+            temperature (float): Temperature for the sine/cosine functions.
+
+        Returns:
+            (torch.Tensor): Position embedding with shape [1, embed_dim, h*w].
+        """
         assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         grid_w = torch.arange(w, dtype=torch.float32)
         grid_h = torch.arange(h, dtype=torch.float32)
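The docstring above is compact, so here is a standalone, hedged sketch of the same construction: one quarter of the embedding dimension each for sin/cos over the x grid and sin/cos over the y grid. This is an illustrative re-implementation, not the library function itself, and it returns the tokens-first layout [1, h*w, embed_dim] that can be added directly to flattened features:

# Quarter-split 2D sine-cosine position embedding over a flattened w*h grid.
import torch

def sincos_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0) -> torch.Tensor:
    assert embed_dim % 4 == 0, "embed_dim must be divisible by 4"
    grid_w, grid_h = torch.meshgrid(
        torch.arange(w, dtype=torch.float32), torch.arange(h, dtype=torch.float32), indexing="ij"
    )
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[:, None] * omega[None]  # [w*h, pos_dim]
    out_h = grid_h.flatten()[:, None] * omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]

print(sincos_2d(20, 20, 256).shape)  # torch.Size([1, 400, 256]), broadcastable over [B, HxW, C] tokens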
@@ -119,7 +209,13 @@ class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
     def __init__(self, c, num_heads):
-        """
+        """
+        Initialize a self-attention mechanism using linear transformations and multi-head attention.
+
+        Args:
+            c (int): Input and output channel dimension.
+            num_heads (int): Number of attention heads.
+        """
         super().__init__()
         self.q = nn.Linear(c, c, bias=False)
         self.k = nn.Linear(c, c, bias=False)
@@ -129,16 +225,40 @@ class TransformerLayer(nn.Module):
         self.fc2 = nn.Linear(c, c, bias=False)
 
     def forward(self, x):
-        """
+        """
+        Apply a transformer block to the input x and return the output.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after transformer layer.
+        """
         x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
         return self.fc2(self.fc1(x)) + x
 
 
 class TransformerBlock(nn.Module):
-    """
+    """
+    Vision Transformer https://arxiv.org/abs/2010.11929.
+
+    Attributes:
+        conv (Conv, optional): Convolution layer if input and output channels differ.
+        linear (nn.Linear): Learnable position embedding.
+        tr (nn.Sequential): Sequential container of transformer layers.
+        c2 (int): Output channel dimension.
+    """
 
     def __init__(self, c1, c2, num_heads, num_layers):
-        """
+        """
+        Initialize a Transformer module with position embedding and specified number of heads and layers.
+
+        Args:
+            c1 (int): Input channel dimension.
+            c2 (int): Output channel dimension.
+            num_heads (int): Number of attention heads.
+            num_layers (int): Number of transformer layers.
+        """
         super().__init__()
         self.conv = None
         if c1 != c2:
@@ -148,7 +268,15 @@ class TransformerBlock(nn.Module):
         self.c2 = c2
 
     def forward(self, x):
-        """
+        """
+        Forward propagates the input through the bottleneck module.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape [b, c1, w, h].
+
+        Returns:
+            (torch.Tensor): Output tensor with shape [b, c2, w, h].
+        """
         if self.conv is not None:
             x = self.conv(x)
         b, _, w, h = x.shape
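For the TransformerLayer/TransformerBlock pair above, a minimal shape check (channel counts and spatial size are assumed):

# TransformerBlock flattens the [b, c, w, h] map into tokens, applies num_layers TransformerLayers
# with a learned position embedding, and reshapes back; a Conv projection is inserted when c1 != c2.
import torch
from ultralytics.nn.modules.transformer import TransformerBlock

block = TransformerBlock(c1=64, c2=128, num_heads=4, num_layers=2)
x = torch.randn(2, 64, 16, 16)
print(block(x).shape)  # torch.Size([2, 128, 16, 16])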
@@ -160,22 +288,55 @@ class MLPBlock(nn.Module):
     """Implements a single block of a multi-layer perceptron."""
 
     def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
-        """
+        """
+        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
+
+        Args:
+            embedding_dim (int): Input and output dimension.
+            mlp_dim (int): Hidden dimension.
+            act (nn.Module): Activation function.
+        """
         super().__init__()
         self.lin1 = nn.Linear(embedding_dim, mlp_dim)
         self.lin2 = nn.Linear(mlp_dim, embedding_dim)
         self.act = act()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
+        """
+        Forward pass for the MLPBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after MLP block.
+        """
         return self.lin2(self.act(self.lin1(x)))
 
 
 class MLP(nn.Module):
-    """
+    """
+    Implements a simple multi-layer perceptron (also called FFN).
+
+    Attributes:
+        num_layers (int): Number of layers in the MLP.
+        layers (nn.ModuleList): List of linear layers.
+        sigmoid (bool): Whether to apply sigmoid to the output.
+        act (nn.Module): Activation function.
+    """
 
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
-        """
+        """
+        Initialize the MLP with specified input, hidden, output dimensions and number of layers.
+
+        Args:
+            input_dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension.
+            output_dim (int): Output dimension.
+            num_layers (int): Number of layers.
+            act (nn.Module): Activation function.
+            sigmoid (bool): Whether to apply sigmoid to the output.
+        """
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
@@ -184,7 +345,15 @@ class MLP(nn.Module):
         self.act = act()
 
     def forward(self, x):
-        """
+        """
+        Forward pass for the entire MLP.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after MLP.
+        """
         for i, layer in enumerate(self.layers):
             x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
         return x.sigmoid() if getattr(self, "sigmoid", False) else x
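A minimal usage sketch for the MLP documented above (dimensions are assumed; the sigmoid flag squashes the final layer's output):

# Three-layer MLP, e.g. as a small prediction head over per-query features.
import torch
from ultralytics.nn.modules.transformer import MLP

mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3, sigmoid=True)
x = torch.randn(2, 300, 256)              # [batch, queries, channels]
out = mlp(x)
print(out.shape, bool((out >= 0).all()))  # torch.Size([2, 300, 4]) True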
@@ -198,17 +367,36 @@ class LayerNorm2d(nn.Module):
     https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
     and
     https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
+
+    Attributes:
+        weight (nn.Parameter): Learnable scale parameter.
+        bias (nn.Parameter): Learnable bias parameter.
+        eps (float): Small constant for numerical stability.
     """
 
     def __init__(self, num_channels, eps=1e-6):
-        """
+        """
+        Initialize LayerNorm2d with the given parameters.
+
+        Args:
+            num_channels (int): Number of channels in the input.
+            eps (float): Small constant for numerical stability.
+        """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(num_channels))
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps
 
     def forward(self, x):
-        """
+        """
+        Perform forward pass for 2D layer normalization.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Normalized output tensor.
+        """
         u = x.mean(1, keepdim=True)
         s = (x - u).pow(2).mean(1, keepdim=True)
         x = (x - u) / torch.sqrt(s + self.eps)
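The forward above normalizes over the channel dimension of an NCHW tensor, unlike nn.LayerNorm, which expects channels last. A small hedged check (tensor sizes assumed):

# With freshly initialized weight=1 and bias=0, each pixel's channel vector ends up with ~zero mean.
import torch
from ultralytics.nn.modules.transformer import LayerNorm2d

ln = LayerNorm2d(num_channels=8)
x = torch.randn(2, 8, 4, 4)
y = ln(x)
print(y.shape, bool(y.mean(dim=1).abs().max() < 1e-5))  # torch.Size([2, 8, 4, 4]) True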
@@ -220,10 +408,29 @@ class MSDeformAttn(nn.Module):
     Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+
+    Attributes:
+        im2col_step (int): Step size for im2col operations.
+        d_model (int): Model dimension.
+        n_levels (int): Number of feature levels.
+        n_heads (int): Number of attention heads.
+        n_points (int): Number of sampling points per attention head per feature level.
+        sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
+        attention_weights (nn.Linear): Linear layer for generating attention weights.
+        value_proj (nn.Linear): Linear layer for projecting values.
+        output_proj (nn.Linear): Linear layer for projecting output.
     """
 
     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
-        """
+        """
+        Initialize MSDeformAttn with the given parameters.
+
+        Args:
+            d_model (int): Model dimension.
+            n_levels (int): Number of feature levels.
+            n_heads (int): Number of attention heads.
+            n_points (int): Number of sampling points per attention head per feature level.
+        """
         super().__init__()
         if d_model % n_heads != 0:
             raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
@@ -273,15 +480,16 @@ class MSDeformAttn(nn.Module):
         https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
 
         Args:
-            query (torch.Tensor): [bs, query_length, C]
-            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1],
-                bottom-right (1, 1), including padding area
-            value (torch.Tensor): [bs, value_length, C]
-            value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
-            value_mask (Tensor): [bs, value_length], True for non-padding elements,
+            query (torch.Tensor): Tensor with shape [bs, query_length, C].
+            refer_bbox (torch.Tensor): Tensor with shape [bs, query_length, n_levels, 2], range in [0, 1],
+                top-left (0,0), bottom-right (1, 1), including padding area.
+            value (torch.Tensor): Tensor with shape [bs, value_length, C].
+            value_shapes (List): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
+            value_mask (torch.Tensor, optional): Tensor with shape [bs, value_length], True for non-padding elements,
+                False for padding elements.
 
         Returns:
-
+            (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
         """
         bs, len_q = query.shape[:2]
         len_v = value.shape[1]
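A hedged shape walk-through of MSDeformAttn.forward based on the Args documented above. All concrete sizes here are assumptions for illustration; the one hard constraint is that the value length equals the sum of H*W over value_shapes:

# Two feature levels, 8 heads, 4 sampling points; the output keeps the query layout [bs, query_length, C].
import torch
from ultralytics.nn.modules.transformer import MSDeformAttn

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
value_shapes = [(32, 32), (16, 16)]                        # (H, W) per feature level
value = torch.randn(1, 32 * 32 + 16 * 16, 256)             # [bs, value_length, C]
query = torch.randn(1, 300, 256)                           # [bs, query_length, C]
refer_bbox = torch.rand(1, 300, 2, 2)                      # [bs, query_length, n_levels, 2] in [0, 1]
print(attn(query, refer_bbox, value, value_shapes).shape)  # torch.Size([1, 300, 256])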
@@ -315,10 +523,35 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
+
+    Attributes:
+        self_attn (nn.MultiheadAttention): Self-attention module.
+        dropout1 (nn.Dropout): Dropout after self-attention.
+        norm1 (nn.LayerNorm): Layer normalization after self-attention.
+        cross_attn (MSDeformAttn): Cross-attention module.
+        dropout2 (nn.Dropout): Dropout after cross-attention.
+        norm2 (nn.LayerNorm): Layer normalization after cross-attention.
+        linear1 (nn.Linear): First linear layer in the feedforward network.
+        act (nn.Module): Activation function.
+        dropout3 (nn.Dropout): Dropout in the feedforward network.
+        linear2 (nn.Linear): Second linear layer in the feedforward network.
+        dropout4 (nn.Dropout): Dropout after the feedforward network.
+        norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
     """
 
     def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
-        """
+        """
+        Initialize the DeformableTransformerDecoderLayer with the given parameters.
+
+        Args:
+            d_model (int): Model dimension.
+            n_heads (int): Number of attention heads.
+            d_ffn (int): Dimension of the feedforward network.
+            dropout (float): Dropout probability.
+            act (nn.Module): Activation function.
+            n_levels (int): Number of feature levels.
+            n_points (int): Number of sampling points.
+        """
         super().__init__()
 
         # Self attention
@@ -345,13 +578,35 @@ class DeformableTransformerDecoderLayer(nn.Module):
         return tensor if pos is None else tensor + pos
 
     def forward_ffn(self, tgt):
-        """
+        """
+        Perform forward pass through the Feed-Forward Network part of the layer.
+
+        Args:
+            tgt (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after FFN.
+        """
         tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)
 
     def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
-        """
+        """
+        Perform the forward pass through the entire decoder layer.
+
+        Args:
+            embed (torch.Tensor): Input embeddings.
+            refer_bbox (torch.Tensor): Reference bounding boxes.
+            feats (torch.Tensor): Feature maps.
+            shapes (List): Feature shapes.
+            padding_mask (torch.Tensor, optional): Padding mask.
+            attn_mask (torch.Tensor, optional): Attention mask.
+            query_pos (torch.Tensor, optional): Query position embeddings.
+
+        Returns:
+            (torch.Tensor): Output tensor after decoder layer.
+        """
         # Self attention
         q = k = self.with_pos_embed(embed, query_pos)
         tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
@@ -376,10 +631,24 @@ class DeformableTransformerDecoder(nn.Module):
     Implementation of Deformable Transformer Decoder based on PaddleDetection.
 
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+
+    Attributes:
+        layers (nn.ModuleList): List of decoder layers.
+        num_layers (int): Number of decoder layers.
+        hidden_dim (int): Hidden dimension.
+        eval_idx (int): Index of the layer to use during evaluation.
     """
 
     def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
-        """
+        """
+        Initialize the DeformableTransformerDecoder with the given parameters.
+
+        Args:
+            hidden_dim (int): Hidden dimension.
+            decoder_layer (nn.Module): Decoder layer module.
+            num_layers (int): Number of decoder layers.
+            eval_idx (int): Index of the layer to use during evaluation.
+        """
         super().__init__()
         self.layers = _get_clones(decoder_layer, num_layers)
         self.num_layers = num_layers
@@ -398,7 +667,24 @@ class DeformableTransformerDecoder(nn.Module):
         attn_mask=None,
         padding_mask=None,
     ):
-        """
+        """
+        Perform the forward pass through the entire decoder.
+
+        Args:
+            embed (torch.Tensor): Decoder embeddings.
+            refer_bbox (torch.Tensor): Reference bounding boxes.
+            feats (torch.Tensor): Image features.
+            shapes (List): Feature shapes.
+            bbox_head (nn.Module): Bounding box prediction head.
+            score_head (nn.Module): Score prediction head.
+            pos_mlp (nn.Module): Position MLP.
+            attn_mask (torch.Tensor, optional): Attention mask.
+            padding_mask (torch.Tensor, optional): Padding mask.
+
+        Returns:
+            dec_bboxes (torch.Tensor): Decoded bounding boxes.
+            dec_cls (torch.Tensor): Decoded classification scores.
+        """
         output = embed
         dec_bboxes = []
         dec_cls = []
ultralytics/nn/modules/utils.py CHANGED
@@ -1,5 +1,4 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Module utils."""
 
 import copy
 import math
@@ -46,9 +45,24 @@ def multi_scale_deformable_attn_pytorch(
     attention_weights: torch.Tensor,
 ) -> torch.Tensor:
     """
-
+    Implement multi-scale deformable attention in PyTorch.
 
-
+    This function performs deformable attention across multiple feature map scales, allowing the model to attend to
+    different spatial locations with learned offsets.
+
+    Args:
+        value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
+        value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
+        sampling_locations (torch.Tensor): The sampling locations with shape
+            (bs, num_queries, num_heads, num_levels, num_points, 2).
+        attention_weights (torch.Tensor): The attention weights with shape
+            (bs, num_queries, num_heads, num_levels, num_points).
+
+    Returns:
+        (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
+
+    References:
+        https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
     """
     bs, _, num_heads, embed_dims = value.shape
     _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
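To make the documented shapes concrete, a hedged shape check (level, point, and head counts are assumptions; value_spatial_shapes is passed here as a plain list of (H, W) pairs):

# The value length must equal the sum of H*W per level; the output folds heads back into channels.
import torch
from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch

bs, num_queries, num_heads, head_dims = 1, 300, 8, 32
shapes = [(32, 32), (16, 16)]                            # num_levels = 2
num_keys = sum(h * w for h, w in shapes)
value = torch.randn(bs, num_keys, num_heads, head_dims)
locs = torch.rand(bs, num_queries, num_heads, 2, 4, 2)   # num_points = 4, xy in [0, 1]
weights = torch.rand(bs, num_queries, num_heads, 2, 4).softmax(-1)
out = multi_scale_deformable_attn_pytorch(value, shapes, locs, weights)
print(out.shape)  # torch.Size([1, 300, 256]), i.e. (bs, num_queries, num_heads * head_dims)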