ultralytics-8.3.88-py3-none-any.whl → ultralytics-8.3.90-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. tests/conftest.py +2 -2
  2. tests/test_cli.py +13 -11
  3. tests/test_cuda.py +10 -1
  4. tests/test_integrations.py +1 -5
  5. tests/test_python.py +16 -16
  6. tests/test_solutions.py +9 -9
  7. ultralytics/__init__.py +1 -1
  8. ultralytics/cfg/__init__.py +3 -1
  9. ultralytics/cfg/models/11/yolo11-cls.yaml +5 -5
  10. ultralytics/cfg/models/11/yolo11-obb.yaml +5 -5
  11. ultralytics/cfg/models/11/yolo11-pose.yaml +5 -5
  12. ultralytics/cfg/models/11/yolo11-seg.yaml +5 -5
  13. ultralytics/cfg/models/11/yolo11.yaml +5 -5
  14. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +5 -5
  15. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +5 -5
  16. ultralytics/cfg/models/v8/yolov8-ghost.yaml +5 -5
  17. ultralytics/cfg/models/v8/yolov8-obb.yaml +5 -5
  18. ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
  19. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +5 -5
  20. ultralytics/cfg/models/v8/yolov8-world.yaml +5 -5
  21. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +5 -5
  22. ultralytics/cfg/models/v8/yolov8.yaml +5 -5
  23. ultralytics/cfg/models/v9/yolov9c-seg.yaml +1 -1
  24. ultralytics/cfg/models/v9/yolov9c.yaml +1 -1
  25. ultralytics/cfg/models/v9/yolov9e-seg.yaml +1 -1
  26. ultralytics/cfg/models/v9/yolov9e.yaml +1 -1
  27. ultralytics/cfg/models/v9/yolov9m.yaml +1 -1
  28. ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
  29. ultralytics/cfg/models/v9/yolov9t.yaml +1 -1
  30. ultralytics/data/annotator.py +9 -14
  31. ultralytics/data/base.py +125 -39
  32. ultralytics/data/build.py +63 -24
  33. ultralytics/data/converter.py +34 -33
  34. ultralytics/data/dataset.py +207 -53
  35. ultralytics/data/loaders.py +1 -0
  36. ultralytics/data/split_dota.py +39 -12
  37. ultralytics/data/utils.py +33 -47
  38. ultralytics/engine/exporter.py +19 -17
  39. ultralytics/engine/model.py +69 -90
  40. ultralytics/engine/predictor.py +106 -21
  41. ultralytics/engine/trainer.py +32 -23
  42. ultralytics/engine/tuner.py +31 -38
  43. ultralytics/engine/validator.py +75 -41
  44. ultralytics/hub/__init__.py +21 -26
  45. ultralytics/hub/auth.py +9 -12
  46. ultralytics/hub/session.py +76 -21
  47. ultralytics/hub/utils.py +19 -17
  48. ultralytics/models/fastsam/model.py +23 -17
  49. ultralytics/models/fastsam/predict.py +36 -16
  50. ultralytics/models/fastsam/utils.py +5 -5
  51. ultralytics/models/fastsam/val.py +6 -6
  52. ultralytics/models/nas/model.py +29 -24
  53. ultralytics/models/nas/predict.py +14 -11
  54. ultralytics/models/nas/val.py +11 -13
  55. ultralytics/models/rtdetr/model.py +20 -11
  56. ultralytics/models/rtdetr/predict.py +21 -21
  57. ultralytics/models/rtdetr/train.py +25 -24
  58. ultralytics/models/rtdetr/val.py +47 -14
  59. ultralytics/models/sam/__init__.py +1 -1
  60. ultralytics/models/sam/amg.py +50 -4
  61. ultralytics/models/sam/model.py +8 -14
  62. ultralytics/models/sam/modules/decoders.py +18 -21
  63. ultralytics/models/sam/modules/encoders.py +25 -46
  64. ultralytics/models/sam/modules/memory_attention.py +19 -15
  65. ultralytics/models/sam/modules/sam.py +18 -25
  66. ultralytics/models/sam/modules/tiny_encoder.py +19 -29
  67. ultralytics/models/sam/modules/transformer.py +35 -57
  68. ultralytics/models/sam/modules/utils.py +15 -15
  69. ultralytics/models/sam/predict.py +0 -3
  70. ultralytics/models/utils/loss.py +87 -36
  71. ultralytics/models/utils/ops.py +26 -31
  72. ultralytics/models/yolo/classify/predict.py +30 -12
  73. ultralytics/models/yolo/classify/train.py +83 -19
  74. ultralytics/models/yolo/classify/val.py +45 -23
  75. ultralytics/models/yolo/detect/predict.py +29 -19
  76. ultralytics/models/yolo/detect/train.py +90 -23
  77. ultralytics/models/yolo/detect/val.py +150 -29
  78. ultralytics/models/yolo/model.py +1 -2
  79. ultralytics/models/yolo/obb/predict.py +18 -13
  80. ultralytics/models/yolo/obb/train.py +12 -8
  81. ultralytics/models/yolo/obb/val.py +35 -22
  82. ultralytics/models/yolo/pose/predict.py +28 -15
  83. ultralytics/models/yolo/pose/train.py +21 -8
  84. ultralytics/models/yolo/pose/val.py +51 -31
  85. ultralytics/models/yolo/segment/predict.py +27 -16
  86. ultralytics/models/yolo/segment/train.py +11 -8
  87. ultralytics/models/yolo/segment/val.py +110 -29
  88. ultralytics/models/yolo/world/train.py +43 -16
  89. ultralytics/models/yolo/world/train_world.py +61 -36
  90. ultralytics/nn/autobackend.py +28 -14
  91. ultralytics/nn/modules/__init__.py +12 -12
  92. ultralytics/nn/modules/activation.py +12 -3
  93. ultralytics/nn/modules/block.py +587 -84
  94. ultralytics/nn/modules/conv.py +418 -54
  95. ultralytics/nn/modules/head.py +3 -4
  96. ultralytics/nn/modules/transformer.py +320 -34
  97. ultralytics/nn/modules/utils.py +17 -3
  98. ultralytics/nn/tasks.py +226 -79
  99. ultralytics/solutions/ai_gym.py +2 -2
  100. ultralytics/solutions/analytics.py +4 -4
  101. ultralytics/solutions/heatmap.py +4 -4
  102. ultralytics/solutions/instance_segmentation.py +10 -4
  103. ultralytics/solutions/object_blurrer.py +2 -2
  104. ultralytics/solutions/object_counter.py +2 -2
  105. ultralytics/solutions/object_cropper.py +2 -2
  106. ultralytics/solutions/parking_management.py +9 -9
  107. ultralytics/solutions/queue_management.py +1 -1
  108. ultralytics/solutions/region_counter.py +2 -2
  109. ultralytics/solutions/security_alarm.py +7 -7
  110. ultralytics/solutions/solutions.py +7 -4
  111. ultralytics/solutions/speed_estimation.py +2 -2
  112. ultralytics/solutions/streamlit_inference.py +6 -6
  113. ultralytics/solutions/trackzone.py +9 -2
  114. ultralytics/solutions/vision_eye.py +4 -4
  115. ultralytics/trackers/basetrack.py +1 -1
  116. ultralytics/trackers/bot_sort.py +23 -22
  117. ultralytics/trackers/byte_tracker.py +4 -4
  118. ultralytics/trackers/track.py +2 -1
  119. ultralytics/trackers/utils/gmc.py +26 -27
  120. ultralytics/trackers/utils/kalman_filter.py +31 -29
  121. ultralytics/trackers/utils/matching.py +7 -7
  122. ultralytics/utils/__init__.py +37 -35
  123. ultralytics/utils/autobatch.py +5 -5
  124. ultralytics/utils/benchmarks.py +111 -18
  125. ultralytics/utils/callbacks/base.py +3 -3
  126. ultralytics/utils/callbacks/clearml.py +11 -11
  127. ultralytics/utils/callbacks/comet.py +35 -22
  128. ultralytics/utils/callbacks/dvc.py +11 -10
  129. ultralytics/utils/callbacks/hub.py +8 -8
  130. ultralytics/utils/callbacks/mlflow.py +1 -1
  131. ultralytics/utils/callbacks/neptune.py +12 -10
  132. ultralytics/utils/callbacks/raytune.py +1 -1
  133. ultralytics/utils/callbacks/tensorboard.py +6 -6
  134. ultralytics/utils/callbacks/wb.py +16 -16
  135. ultralytics/utils/checks.py +139 -68
  136. ultralytics/utils/dist.py +15 -2
  137. ultralytics/utils/downloads.py +37 -56
  138. ultralytics/utils/files.py +12 -13
  139. ultralytics/utils/instance.py +117 -52
  140. ultralytics/utils/loss.py +28 -33
  141. ultralytics/utils/metrics.py +246 -181
  142. ultralytics/utils/ops.py +65 -61
  143. ultralytics/utils/patches.py +8 -6
  144. ultralytics/utils/plotting.py +72 -59
  145. ultralytics/utils/tal.py +88 -57
  146. ultralytics/utils/torch_utils.py +202 -64
  147. ultralytics/utils/triton.py +13 -3
  148. ultralytics/utils/tuner.py +13 -25
  149. {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/METADATA +2 -2
  150. ultralytics-8.3.90.dist-info/RECORD +250 -0
  151. ultralytics-8.3.88.dist-info/RECORD +0 -250
  152. {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/LICENSE +0 -0
  153. {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/WHEEL +0 -0
  154. {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/entry_points.txt +0 -0
  155. {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/head.py

@@ -32,7 +32,7 @@ class Detect(nn.Module):
     legacy = False  # backward compatibility for v3/v5/v8/v9 models
 
     def __init__(self, nc=80, ch=()):
-        """Initializes the YOLO detection layer with specified number of classes and channels."""
+        """Initialize the YOLO detection layer with specified number of classes and channels."""
         super().__init__()
         self.nc = nc  # number of classes
         self.nl = len(ch)  # number of detection layers
@@ -273,7 +273,7 @@ class Pose(Detect):
        else:
            y = kpts.clone()
            if ndim == 3:
-                y[:, 2::3] = y[:, 2::3].sigmoid()  # sigmoid (WARNING: inplace .sigmoid_() Apple MPS bug)
+                y[:, 2::ndim] = y[:, 2::ndim].sigmoid()  # sigmoid (WARNING: inplace .sigmoid_() Apple MPS bug)
            y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
            y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
            return y
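For reference, a minimal standalone sketch of the keypoint decode this hunk touches. Shapes, stride values, and anchors below are hypothetical stand-ins for the module buffers used in the source:

```python
import torch

# Toy decode of raw pose-head keypoint predictions (hypothetical sizes).
bs, nk, ndim, na = 2, 17, 3, 6400              # batch, keypoints, (x, y, conf), anchors
kpts = torch.randn(bs, nk * ndim, na)          # raw network output
anchors = torch.rand(2, na)                    # stand-in for the per-anchor grid centers
strides = torch.full((1, na), 8.0)             # stand-in for the per-anchor strides

y = kpts.clone()
if ndim == 3:
    y[:, 2::ndim] = y[:, 2::ndim].sigmoid()    # keypoint confidence squashed to [0, 1]
y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (anchors[0] - 0.5)) * strides  # x in input-image pixels
y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (anchors[1] - 0.5)) * strides  # y in input-image pixels
print(y.shape)  # torch.Size([2, 51, 6400])
```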
@@ -400,7 +400,7 @@ class RTDETRDecoder(nn.Module):
             nh (int): Number of heads in multi-head attention. Default is 8.
             ndl (int): Number of decoder layers. Default is 6.
             d_ffn (int): Dimension of the feed-forward networks. Default is 1024.
-            dropout (float): Dropout rate. Default is 0.
+            dropout (float): Dropout rate. Default is 0.0.
             act (nn.Module): Activation function. Default is nn.ReLU.
             eval_idx (int): Evaluation index. Default is -1.
             nd (int): Number of denoising. Default is 100.
@@ -563,7 +563,6 @@ class RTDETRDecoder(nn.Module):
 
         return embeddings, refer_bbox, enc_bboxes, enc_scores
 
-    # TODO
     def _reset_parameters(self):
         """Initializes or resets the parameters of the model's various components with predefined weights and biases."""
         # Class and bbox head init
ultralytics/nn/modules/transformer.py

@@ -26,10 +26,34 @@ __all__ = (
 
 
 class TransformerEncoderLayer(nn.Module):
-    """Defines a single layer of the transformer encoder."""
+    """
+    Defines a single layer of the transformer encoder.
+
+    Attributes:
+        ma (nn.MultiheadAttention): Multi-head attention module.
+        fc1 (nn.Linear): First linear layer in the feedforward network.
+        fc2 (nn.Linear): Second linear layer in the feedforward network.
+        norm1 (nn.LayerNorm): Layer normalization after attention.
+        norm2 (nn.LayerNorm): Layer normalization after feedforward network.
+        dropout (nn.Dropout): Dropout layer for the feedforward network.
+        dropout1 (nn.Dropout): Dropout layer after attention.
+        dropout2 (nn.Dropout): Dropout layer after feedforward network.
+        act (nn.Module): Activation function.
+        normalize_before (bool): Whether to apply normalization before attention and feedforward.
+    """
 
     def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
-        """Initialize the TransformerEncoderLayer with specified parameters."""
+        """
+        Initialize the TransformerEncoderLayer with specified parameters.
+
+        Args:
+            c1 (int): Input dimension.
+            cm (int): Hidden dimension in the feedforward network.
+            num_heads (int): Number of attention heads.
+            dropout (float): Dropout probability.
+            act (nn.Module): Activation function.
+            normalize_before (bool): Whether to apply normalization before attention and feedforward.
+        """
         super().__init__()
         from ...utils.torch_utils import TORCH_1_9
 
@@ -57,7 +81,18 @@ class TransformerEncoderLayer(nn.Module):
         return tensor if pos is None else tensor + pos
 
     def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """Performs forward pass with post-normalization."""
+        """
+        Perform forward pass with post-normalization.
+
+        Args:
+            src (torch.Tensor): Input tensor.
+            src_mask (torch.Tensor, optional): Mask for the src sequence.
+            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
+            pos (torch.Tensor, optional): Positional encoding.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention and feedforward.
+        """
         q = k = self.with_pos_embed(src, pos)
         src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
         src = src + self.dropout1(src2)
@@ -67,7 +102,18 @@ class TransformerEncoderLayer(nn.Module):
         return self.norm2(src)
 
     def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """Performs forward pass with pre-normalization."""
+        """
+        Perform forward pass with pre-normalization.
+
+        Args:
+            src (torch.Tensor): Input tensor.
+            src_mask (torch.Tensor, optional): Mask for the src sequence.
+            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
+            pos (torch.Tensor, optional): Positional encoding.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention and feedforward.
+        """
         src2 = self.norm1(src)
         q = k = self.with_pos_embed(src2, pos)
         src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
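The two methods above differ only in where the first LayerNorm is applied. A toy sketch of the two orderings on a bare nn.MultiheadAttention, with assumed dimensions (dropout and the feedforward sublayer omitted):

```python
import torch
import torch.nn as nn

# Post-norm vs pre-norm ordering of the attention sublayer (toy dimensions, no dropout/FFN).
d, heads = 64, 4
ma, norm1 = nn.MultiheadAttention(d, heads, batch_first=True), nn.LayerNorm(d)
x = torch.randn(2, 10, d)  # (batch, seq, channels)

post = norm1(x + ma(x, x, x)[0])   # forward_post: attend -> residual add -> normalize
x2 = norm1(x)                      # forward_pre: normalize first ...
pre = x + ma(x2, x2, x2)[0]        # ... then attend and add the residual
print(post.shape, pre.shape)       # torch.Size([2, 10, 64]) torch.Size([2, 10, 64])
```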
@@ -77,21 +123,54 @@ class TransformerEncoderLayer(nn.Module):
         return src + self.dropout2(src2)
 
     def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """Forward propagates the input through the encoder module."""
+        """
+        Forward propagates the input through the encoder module.
+
+        Args:
+            src (torch.Tensor): Input tensor.
+            src_mask (torch.Tensor, optional): Mask for the src sequence.
+            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
+            pos (torch.Tensor, optional): Positional encoding.
+
+        Returns:
+            (torch.Tensor): Output tensor after transformer encoder layer.
+        """
         if self.normalize_before:
             return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
         return self.forward_post(src, src_mask, src_key_padding_mask, pos)
 
 
 class AIFI(TransformerEncoderLayer):
-    """Defines the AIFI transformer layer."""
+    """
+    Defines the AIFI transformer layer.
+
+    This class extends TransformerEncoderLayer to work with 2D data by adding positional embeddings.
+    """
 
     def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
-        """Initialize the AIFI instance with specified parameters."""
+        """
+        Initialize the AIFI instance with specified parameters.
+
+        Args:
+            c1 (int): Input dimension.
+            cm (int): Hidden dimension in the feedforward network.
+            num_heads (int): Number of attention heads.
+            dropout (float): Dropout probability.
+            act (nn.Module): Activation function.
+            normalize_before (bool): Whether to apply normalization before attention and feedforward.
+        """
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
     def forward(self, x):
-        """Forward pass for the AIFI transformer layer."""
+        """
+        Forward pass for the AIFI transformer layer.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape [B, C, H, W].
+
+        Returns:
+            (torch.Tensor): Output tensor with shape [B, C, H, W].
+        """
         c, h, w = x.shape[1:]
         pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
         # Flatten [B, C, H, W] to [B, HxW, C]
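A toy usage of the AIFI layer documented above; the sizes are arbitrary and the import path is assumed from this diff's ultralytics/nn/modules/transformer.py:

```python
import torch
from ultralytics.nn.modules.transformer import AIFI  # import path assumed from this diff

# AIFI keeps the [B, C, H, W] layout: flatten, attend with 2D sin-cos positions, reshape back.
layer = AIFI(c1=256, cm=1024, num_heads=8)
x = torch.randn(1, 256, 20, 20)
print(layer(x).shape)  # torch.Size([1, 256, 20, 20])
```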
@@ -100,7 +179,18 @@ class AIFI(TransformerEncoderLayer):
 
     @staticmethod
     def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
-        """Builds 2D sine-cosine position embedding."""
+        """
+        Build 2D sine-cosine position embedding.
+
+        Args:
+            w (int): Width of the feature map.
+            h (int): Height of the feature map.
+            embed_dim (int): Embedding dimension.
+            temperature (float): Temperature for the sine/cosine functions.
+
+        Returns:
+            (torch.Tensor): Position embedding with shape [1, embed_dim, h*w].
+        """
         assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         grid_w = torch.arange(w, dtype=torch.float32)
         grid_h = torch.arange(h, dtype=torch.float32)
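For reference, a standalone sketch of a 2D sine-cosine position embedding of the kind this method builds; the split into embed_dim // 4 frequency bands per axis follows the assertion shown above, and the printed shape describes this sketch only:

```python
import torch

def sincos_2d(w, h, embed_dim=256, temperature=10000.0):
    """Sketch of a 2D sine-cosine embedding: embed_dim // 4 frequencies per (sin, cos) per axis."""
    assert embed_dim % 4 == 0
    grid_w, grid_h = torch.meshgrid(
        torch.arange(w, dtype=torch.float32), torch.arange(h, dtype=torch.float32), indexing="ij"
    )
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[..., None] @ omega[None]  # (w*h, pos_dim) of position * frequency
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]

print(sincos_2d(20, 20).shape)  # torch.Size([1, 400, 256]) for this sketch
```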
@@ -119,7 +209,13 @@ class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
     def __init__(self, c, num_heads):
-        """Initializes a self-attention mechanism using linear transformations and multi-head attention."""
+        """
+        Initialize a self-attention mechanism using linear transformations and multi-head attention.
+
+        Args:
+            c (int): Input and output channel dimension.
+            num_heads (int): Number of attention heads.
+        """
         super().__init__()
         self.q = nn.Linear(c, c, bias=False)
         self.k = nn.Linear(c, c, bias=False)
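A toy usage of TransformerLayer (sizes assumed; the input is laid out sequence-first here, on the assumption that the attention module keeps PyTorch's default nn.MultiheadAttention layout):

```python
import torch
from ultralytics.nn.modules.transformer import TransformerLayer  # import path assumed from this diff

# Residual self-attention plus a residual two-layer projection, no LayerNorm.
layer = TransformerLayer(c=64, num_heads=4)
x = torch.randn(10, 2, 64)  # assumed (seq_len, batch, channels) layout
print(layer(x).shape)  # torch.Size([10, 2, 64])
```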
@@ -129,16 +225,40 @@ class TransformerLayer(nn.Module):
         self.fc2 = nn.Linear(c, c, bias=False)
 
     def forward(self, x):
-        """Apply a transformer block to the input x and return the output."""
+        """
+        Apply a transformer block to the input x and return the output.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after transformer layer.
+        """
         x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
         return self.fc2(self.fc1(x)) + x
 
 
 class TransformerBlock(nn.Module):
-    """Vision Transformer https://arxiv.org/abs/2010.11929."""
+    """
+    Vision Transformer https://arxiv.org/abs/2010.11929.
+
+    Attributes:
+        conv (Conv, optional): Convolution layer if input and output channels differ.
+        linear (nn.Linear): Learnable position embedding.
+        tr (nn.Sequential): Sequential container of transformer layers.
+        c2 (int): Output channel dimension.
+    """
 
     def __init__(self, c1, c2, num_heads, num_layers):
-        """Initialize a Transformer module with position embedding and specified number of heads and layers."""
+        """
+        Initialize a Transformer module with position embedding and specified number of heads and layers.
+
+        Args:
+            c1 (int): Input channel dimension.
+            c2 (int): Output channel dimension.
+            num_heads (int): Number of attention heads.
+            num_layers (int): Number of transformer layers.
+        """
         super().__init__()
         self.conv = None
         if c1 != c2:
@@ -148,7 +268,15 @@ class TransformerBlock(nn.Module):
         self.c2 = c2
 
     def forward(self, x):
-        """Forward propagates the input through the bottleneck module."""
+        """
+        Forward propagates the input through the bottleneck module.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape [b, c1, w, h].
+
+        Returns:
+            (torch.Tensor): Output tensor with shape [b, c2, w, h].
+        """
         if self.conv is not None:
             x = self.conv(x)
         b, _, w, h = x.shape
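A toy usage of the TransformerBlock documented above (sizes assumed; import path assumed from this diff's transformer.py). The spatial map is flattened to a sequence internally and reshaped back, so a [b, c1, w, h] input yields a [b, c2, w, h] output:

```python
import torch
from ultralytics.nn.modules.transformer import TransformerBlock  # import path assumed from this diff

block = TransformerBlock(c1=64, c2=128, num_heads=4, num_layers=2)  # projects 64 -> 128, then 2 layers
x = torch.randn(1, 64, 16, 16)
print(block(x).shape)  # torch.Size([1, 128, 16, 16])
```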
@@ -160,22 +288,55 @@ class MLPBlock(nn.Module):
     """Implements a single block of a multi-layer perceptron."""
 
     def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
-        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function."""
+        """
+        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
+
+        Args:
+            embedding_dim (int): Input and output dimension.
+            mlp_dim (int): Hidden dimension.
+            act (nn.Module): Activation function.
+        """
         super().__init__()
         self.lin1 = nn.Linear(embedding_dim, mlp_dim)
         self.lin2 = nn.Linear(mlp_dim, embedding_dim)
         self.act = act()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Forward pass for the MLPBlock."""
+        """
+        Forward pass for the MLPBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after MLP block.
+        """
         return self.lin2(self.act(self.lin1(x)))
 
 
 class MLP(nn.Module):
-    """Implements a simple multi-layer perceptron (also called FFN)."""
+    """
+    Implements a simple multi-layer perceptron (also called FFN).
+
+    Attributes:
+        num_layers (int): Number of layers in the MLP.
+        layers (nn.ModuleList): List of linear layers.
+        sigmoid (bool): Whether to apply sigmoid to the output.
+        act (nn.Module): Activation function.
+    """
 
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
-        """Initialize the MLP with specified input, hidden, output dimensions and number of layers."""
+        """
+        Initialize the MLP with specified input, hidden, output dimensions and number of layers.
+
+        Args:
+            input_dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension.
+            output_dim (int): Output dimension.
+            num_layers (int): Number of layers.
+            act (nn.Module): Activation function.
+            sigmoid (bool): Whether to apply sigmoid to the output.
+        """
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
@@ -184,7 +345,15 @@ class MLP(nn.Module):
         self.act = act()
 
     def forward(self, x):
-        """Forward pass for the entire MLP."""
+        """
+        Forward pass for the entire MLP.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after MLP.
+        """
         for i, layer in enumerate(self.layers):
             x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
         return x.sigmoid() if getattr(self, "sigmoid", False) else x
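A toy usage of the MLP documented above (dimensions arbitrary; import path assumed from this diff's transformer.py). The activation is applied to every layer except the last, and the optional sigmoid squashes the final output:

```python
import torch
from ultralytics.nn.modules.transformer import MLP  # import path assumed from this diff

mlp = MLP(input_dim=256, hidden_dim=512, output_dim=4, num_layers=3)  # Linear 256 -> 512 -> 512 -> 4
x = torch.randn(2, 300, 256)
print(mlp(x).shape)  # torch.Size([2, 300, 4])
```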
@@ -198,17 +367,36 @@ class LayerNorm2d(nn.Module):
     https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
     and
     https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
+
+    Attributes:
+        weight (nn.Parameter): Learnable scale parameter.
+        bias (nn.Parameter): Learnable bias parameter.
+        eps (float): Small constant for numerical stability.
     """
 
     def __init__(self, num_channels, eps=1e-6):
-        """Initialize LayerNorm2d with the given parameters."""
+        """
+        Initialize LayerNorm2d with the given parameters.
+
+        Args:
+            num_channels (int): Number of channels in the input.
+            eps (float): Small constant for numerical stability.
+        """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(num_channels))
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps
 
     def forward(self, x):
-        """Perform forward pass for 2D layer normalization."""
+        """
+        Perform forward pass for 2D layer normalization.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Normalized output tensor.
+        """
         u = x.mean(1, keepdim=True)
         s = (x - u).pow(2).mean(1, keepdim=True)
         x = (x - u) / torch.sqrt(s + self.eps)
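A small sketch of what this forward computes: a per-pixel LayerNorm over the channel dimension, checked against nn.LayerNorm applied channels-last (toy sizes, affine parameters left at their identity initialization):

```python
import torch
import torch.nn as nn

x, eps = torch.randn(2, 8, 4, 4), 1e-6

u = x.mean(1, keepdim=True)                  # per-pixel mean over channels
s = (x - u).pow(2).mean(1, keepdim=True)     # per-pixel (biased) variance over channels
manual = (x - u) / torch.sqrt(s + eps)       # weight=1, bias=0 case

reference = nn.LayerNorm(8, eps=eps)(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(manual, reference, atol=1e-6))  # True
```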
@@ -220,10 +408,29 @@ class MSDeformAttn(nn.Module):
     Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+
+    Attributes:
+        im2col_step (int): Step size for im2col operations.
+        d_model (int): Model dimension.
+        n_levels (int): Number of feature levels.
+        n_heads (int): Number of attention heads.
+        n_points (int): Number of sampling points per attention head per feature level.
+        sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
+        attention_weights (nn.Linear): Linear layer for generating attention weights.
+        value_proj (nn.Linear): Linear layer for projecting values.
+        output_proj (nn.Linear): Linear layer for projecting output.
     """
 
     def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
-        """Initialize MSDeformAttn with the given parameters."""
+        """
+        Initialize MSDeformAttn with the given parameters.
+
+        Args:
+            d_model (int): Model dimension.
+            n_levels (int): Number of feature levels.
+            n_heads (int): Number of attention heads.
+            n_points (int): Number of sampling points per attention head per feature level.
+        """
         super().__init__()
         if d_model % n_heads != 0:
             raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
@@ -273,15 +480,16 @@ class MSDeformAttn(nn.Module):
         https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
 
         Args:
-            query (torch.Tensor): [bs, query_length, C]
-            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
-                bottom-right (1, 1), including padding area
-            value (torch.Tensor): [bs, value_length, C]
-            value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
-            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+            query (torch.Tensor): Tensor with shape [bs, query_length, C].
+            refer_bbox (torch.Tensor): Tensor with shape [bs, query_length, n_levels, 2], range in [0, 1],
+                top-left (0,0), bottom-right (1, 1), including padding area.
+            value (torch.Tensor): Tensor with shape [bs, value_length, C].
+            value_shapes (List): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
+            value_mask (torch.Tensor, optional): Tensor with shape [bs, value_length], True for non-padding elements,
+                False for padding elements.
 
         Returns:
-            output (Tensor): [bs, Length_{query}, C]
+            (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
         """
         bs, len_q = query.shape[:2]
         len_v = value.shape[1]
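A toy call matching the shapes documented above (level count and sizes assumed; import path assumed from this diff's transformer.py):

```python
import torch
from ultralytics.nn.modules.transformer import MSDeformAttn  # import path assumed from this diff

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
value_shapes = [(8, 8), (4, 4)]                          # two feature levels
bs, len_q, len_v = 1, 300, sum(h * w for h, w in value_shapes)
query = torch.randn(bs, len_q, 256)
refer_bbox = torch.rand(bs, len_q, 2, 2)                 # [bs, query_length, n_levels, 2] in [0, 1]
value = torch.randn(bs, len_v, 256)
print(attn(query, refer_bbox, value, value_shapes).shape)  # torch.Size([1, 300, 256])
```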
@@ -315,10 +523,35 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
+
+    Attributes:
+        self_attn (nn.MultiheadAttention): Self-attention module.
+        dropout1 (nn.Dropout): Dropout after self-attention.
+        norm1 (nn.LayerNorm): Layer normalization after self-attention.
+        cross_attn (MSDeformAttn): Cross-attention module.
+        dropout2 (nn.Dropout): Dropout after cross-attention.
+        norm2 (nn.LayerNorm): Layer normalization after cross-attention.
+        linear1 (nn.Linear): First linear layer in the feedforward network.
+        act (nn.Module): Activation function.
+        dropout3 (nn.Dropout): Dropout in the feedforward network.
+        linear2 (nn.Linear): Second linear layer in the feedforward network.
+        dropout4 (nn.Dropout): Dropout after the feedforward network.
+        norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
     """
 
     def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
-        """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
+        """
+        Initialize the DeformableTransformerDecoderLayer with the given parameters.
+
+        Args:
+            d_model (int): Model dimension.
+            n_heads (int): Number of attention heads.
+            d_ffn (int): Dimension of the feedforward network.
+            dropout (float): Dropout probability.
+            act (nn.Module): Activation function.
+            n_levels (int): Number of feature levels.
+            n_points (int): Number of sampling points.
+        """
         super().__init__()
 
         # Self attention
@@ -345,13 +578,35 @@ class DeformableTransformerDecoderLayer(nn.Module):
         return tensor if pos is None else tensor + pos
 
     def forward_ffn(self, tgt):
-        """Perform forward pass through the Feed-Forward Network part of the layer."""
+        """
+        Perform forward pass through the Feed-Forward Network part of the layer.
+
+        Args:
+            tgt (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after FFN.
+        """
         tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)
 
     def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
-        """Perform the forward pass through the entire decoder layer."""
+        """
+        Perform the forward pass through the entire decoder layer.
+
+        Args:
+            embed (torch.Tensor): Input embeddings.
+            refer_bbox (torch.Tensor): Reference bounding boxes.
+            feats (torch.Tensor): Feature maps.
+            shapes (List): Feature shapes.
+            padding_mask (torch.Tensor, optional): Padding mask.
+            attn_mask (torch.Tensor, optional): Attention mask.
+            query_pos (torch.Tensor, optional): Query position embeddings.
+
+        Returns:
+            (torch.Tensor): Output tensor after decoder layer.
+        """
         # Self attention
         q = k = self.with_pos_embed(embed, query_pos)
         tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
@@ -376,10 +631,24 @@ class DeformableTransformerDecoder(nn.Module):
     Implementation of Deformable Transformer Decoder based on PaddleDetection.
 
     https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+
+    Attributes:
+        layers (nn.ModuleList): List of decoder layers.
+        num_layers (int): Number of decoder layers.
+        hidden_dim (int): Hidden dimension.
+        eval_idx (int): Index of the layer to use during evaluation.
     """
 
     def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
-        """Initialize the DeformableTransformerDecoder with the given parameters."""
+        """
+        Initialize the DeformableTransformerDecoder with the given parameters.
+
+        Args:
+            hidden_dim (int): Hidden dimension.
+            decoder_layer (nn.Module): Decoder layer module.
+            num_layers (int): Number of decoder layers.
+            eval_idx (int): Index of the layer to use during evaluation.
+        """
         super().__init__()
         self.layers = _get_clones(decoder_layer, num_layers)
         self.num_layers = num_layers
@@ -398,7 +667,24 @@ class DeformableTransformerDecoder(nn.Module):
         attn_mask=None,
         padding_mask=None,
     ):
-        """Perform the forward pass through the entire decoder."""
+        """
+        Perform the forward pass through the entire decoder.
+
+        Args:
+            embed (torch.Tensor): Decoder embeddings.
+            refer_bbox (torch.Tensor): Reference bounding boxes.
+            feats (torch.Tensor): Image features.
+            shapes (List): Feature shapes.
+            bbox_head (nn.Module): Bounding box prediction head.
+            score_head (nn.Module): Score prediction head.
+            pos_mlp (nn.Module): Position MLP.
+            attn_mask (torch.Tensor, optional): Attention mask.
+            padding_mask (torch.Tensor, optional): Padding mask.
+
+        Returns:
+            dec_bboxes (torch.Tensor): Decoded bounding boxes.
+            dec_cls (torch.Tensor): Decoded classification scores.
+        """
         output = embed
         dec_bboxes = []
         dec_cls = []
ultralytics/nn/modules/utils.py

@@ -1,5 +1,4 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Module utils."""
 
 import copy
 import math
@@ -46,9 +45,24 @@ def multi_scale_deformable_attn_pytorch(
     attention_weights: torch.Tensor,
 ) -> torch.Tensor:
     """
-    Multiscale deformable attention.
+    Implement multi-scale deformable attention in PyTorch.
 
-    https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
+    This function performs deformable attention across multiple feature map scales, allowing the model to attend to
+    different spatial locations with learned offsets.
+
+    Args:
+        value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
+        value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
+        sampling_locations (torch.Tensor): The sampling locations with shape
+            (bs, num_queries, num_heads, num_levels, num_points, 2).
+        attention_weights (torch.Tensor): The attention weights with shape
+            (bs, num_queries, num_heads, num_levels, num_points).
+
+    Returns:
+        (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
+
+    References:
+        https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
     """
     bs, _, num_heads, embed_dims = value.shape
     _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
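A toy call with the shapes documented above (sizes assumed; the spatial shapes are passed here as a plain list of (H, W) pairs, and the printed size reflects num_heads * embed_dims for this sketch):

```python
import torch
from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch  # path assumed from this diff

bs, num_heads, head_dims, num_queries, num_levels, num_points = 1, 8, 32, 100, 2, 4
shapes = [(8, 8), (4, 4)]                                   # two levels -> 64 + 16 = 80 keys
value = torch.randn(bs, 80, num_heads, head_dims)
sampling_locations = torch.rand(bs, num_queries, num_heads, num_levels, num_points, 2)
attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points)
out = multi_scale_deformable_attn_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)  # torch.Size([1, 100, 256])
```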