dgenerate-ultralytics-headless 8.3.196__py3-none-any.whl → 8.3.248__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (243)
  1. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/METADATA +33 -34
  2. dgenerate_ultralytics_headless-8.3.248.dist-info/RECORD +298 -0
  3. tests/__init__.py +5 -7
  4. tests/conftest.py +8 -15
  5. tests/test_cli.py +8 -10
  6. tests/test_cuda.py +9 -10
  7. tests/test_engine.py +29 -2
  8. tests/test_exports.py +69 -21
  9. tests/test_integrations.py +8 -11
  10. tests/test_python.py +109 -71
  11. tests/test_solutions.py +170 -159
  12. ultralytics/__init__.py +27 -9
  13. ultralytics/cfg/__init__.py +57 -64
  14. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  15. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  16. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  17. ultralytics/cfg/datasets/ImageNet.yaml +1 -1
  18. ultralytics/cfg/datasets/Objects365.yaml +19 -15
  19. ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
  20. ultralytics/cfg/datasets/VOC.yaml +19 -21
  21. ultralytics/cfg/datasets/VisDrone.yaml +5 -5
  22. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  23. ultralytics/cfg/datasets/coco-pose.yaml +24 -2
  24. ultralytics/cfg/datasets/coco.yaml +2 -2
  25. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  26. ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
  27. ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
  28. ultralytics/cfg/datasets/dog-pose.yaml +28 -0
  29. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  30. ultralytics/cfg/datasets/dota8.yaml +2 -2
  31. ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
  32. ultralytics/cfg/datasets/kitti.yaml +27 -0
  33. ultralytics/cfg/datasets/lvis.yaml +7 -7
  34. ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
  35. ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
  36. ultralytics/cfg/datasets/xView.yaml +16 -16
  37. ultralytics/cfg/default.yaml +96 -94
  38. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  39. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  40. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  41. ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
  42. ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
  43. ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
  44. ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
  45. ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
  46. ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
  47. ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
  48. ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
  49. ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
  50. ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
  51. ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
  52. ultralytics/cfg/models/v6/yolov6.yaml +1 -1
  53. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  54. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  55. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  56. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  57. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  58. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  59. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  60. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  61. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  62. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  63. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  64. ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
  65. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  66. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  67. ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
  68. ultralytics/cfg/trackers/botsort.yaml +16 -17
  69. ultralytics/cfg/trackers/bytetrack.yaml +9 -11
  70. ultralytics/data/__init__.py +4 -4
  71. ultralytics/data/annotator.py +3 -4
  72. ultralytics/data/augment.py +286 -476
  73. ultralytics/data/base.py +18 -26
  74. ultralytics/data/build.py +151 -26
  75. ultralytics/data/converter.py +38 -50
  76. ultralytics/data/dataset.py +47 -75
  77. ultralytics/data/loaders.py +42 -49
  78. ultralytics/data/split.py +5 -6
  79. ultralytics/data/split_dota.py +8 -15
  80. ultralytics/data/utils.py +41 -45
  81. ultralytics/engine/exporter.py +462 -462
  82. ultralytics/engine/model.py +150 -191
  83. ultralytics/engine/predictor.py +30 -40
  84. ultralytics/engine/results.py +177 -311
  85. ultralytics/engine/trainer.py +193 -120
  86. ultralytics/engine/tuner.py +77 -63
  87. ultralytics/engine/validator.py +39 -22
  88. ultralytics/hub/__init__.py +16 -19
  89. ultralytics/hub/auth.py +6 -12
  90. ultralytics/hub/google/__init__.py +7 -10
  91. ultralytics/hub/session.py +15 -25
  92. ultralytics/hub/utils.py +5 -8
  93. ultralytics/models/__init__.py +1 -1
  94. ultralytics/models/fastsam/__init__.py +1 -1
  95. ultralytics/models/fastsam/model.py +8 -10
  96. ultralytics/models/fastsam/predict.py +19 -30
  97. ultralytics/models/fastsam/utils.py +1 -2
  98. ultralytics/models/fastsam/val.py +5 -7
  99. ultralytics/models/nas/__init__.py +1 -1
  100. ultralytics/models/nas/model.py +5 -8
  101. ultralytics/models/nas/predict.py +7 -9
  102. ultralytics/models/nas/val.py +1 -2
  103. ultralytics/models/rtdetr/__init__.py +1 -1
  104. ultralytics/models/rtdetr/model.py +7 -8
  105. ultralytics/models/rtdetr/predict.py +15 -19
  106. ultralytics/models/rtdetr/train.py +10 -13
  107. ultralytics/models/rtdetr/val.py +21 -23
  108. ultralytics/models/sam/__init__.py +15 -2
  109. ultralytics/models/sam/amg.py +14 -20
  110. ultralytics/models/sam/build.py +26 -19
  111. ultralytics/models/sam/build_sam3.py +377 -0
  112. ultralytics/models/sam/model.py +29 -32
  113. ultralytics/models/sam/modules/blocks.py +83 -144
  114. ultralytics/models/sam/modules/decoders.py +22 -40
  115. ultralytics/models/sam/modules/encoders.py +44 -101
  116. ultralytics/models/sam/modules/memory_attention.py +16 -30
  117. ultralytics/models/sam/modules/sam.py +206 -79
  118. ultralytics/models/sam/modules/tiny_encoder.py +64 -83
  119. ultralytics/models/sam/modules/transformer.py +18 -28
  120. ultralytics/models/sam/modules/utils.py +174 -50
  121. ultralytics/models/sam/predict.py +2268 -366
  122. ultralytics/models/sam/sam3/__init__.py +3 -0
  123. ultralytics/models/sam/sam3/decoder.py +546 -0
  124. ultralytics/models/sam/sam3/encoder.py +529 -0
  125. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  126. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  127. ultralytics/models/sam/sam3/model_misc.py +199 -0
  128. ultralytics/models/sam/sam3/necks.py +129 -0
  129. ultralytics/models/sam/sam3/sam3_image.py +339 -0
  130. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  131. ultralytics/models/sam/sam3/vitdet.py +547 -0
  132. ultralytics/models/sam/sam3/vl_combiner.py +160 -0
  133. ultralytics/models/utils/loss.py +14 -26
  134. ultralytics/models/utils/ops.py +13 -17
  135. ultralytics/models/yolo/__init__.py +1 -1
  136. ultralytics/models/yolo/classify/predict.py +9 -12
  137. ultralytics/models/yolo/classify/train.py +15 -41
  138. ultralytics/models/yolo/classify/val.py +34 -32
  139. ultralytics/models/yolo/detect/predict.py +8 -11
  140. ultralytics/models/yolo/detect/train.py +13 -32
  141. ultralytics/models/yolo/detect/val.py +75 -63
  142. ultralytics/models/yolo/model.py +37 -53
  143. ultralytics/models/yolo/obb/predict.py +5 -14
  144. ultralytics/models/yolo/obb/train.py +11 -14
  145. ultralytics/models/yolo/obb/val.py +42 -39
  146. ultralytics/models/yolo/pose/__init__.py +1 -1
  147. ultralytics/models/yolo/pose/predict.py +7 -22
  148. ultralytics/models/yolo/pose/train.py +10 -22
  149. ultralytics/models/yolo/pose/val.py +40 -59
  150. ultralytics/models/yolo/segment/predict.py +16 -20
  151. ultralytics/models/yolo/segment/train.py +3 -12
  152. ultralytics/models/yolo/segment/val.py +106 -56
  153. ultralytics/models/yolo/world/train.py +12 -16
  154. ultralytics/models/yolo/world/train_world.py +11 -34
  155. ultralytics/models/yolo/yoloe/__init__.py +7 -7
  156. ultralytics/models/yolo/yoloe/predict.py +16 -23
  157. ultralytics/models/yolo/yoloe/train.py +31 -56
  158. ultralytics/models/yolo/yoloe/train_seg.py +5 -10
  159. ultralytics/models/yolo/yoloe/val.py +16 -21
  160. ultralytics/nn/__init__.py +7 -7
  161. ultralytics/nn/autobackend.py +152 -80
  162. ultralytics/nn/modules/__init__.py +60 -60
  163. ultralytics/nn/modules/activation.py +4 -6
  164. ultralytics/nn/modules/block.py +133 -217
  165. ultralytics/nn/modules/conv.py +52 -97
  166. ultralytics/nn/modules/head.py +64 -116
  167. ultralytics/nn/modules/transformer.py +79 -89
  168. ultralytics/nn/modules/utils.py +16 -21
  169. ultralytics/nn/tasks.py +111 -156
  170. ultralytics/nn/text_model.py +40 -67
  171. ultralytics/solutions/__init__.py +12 -12
  172. ultralytics/solutions/ai_gym.py +11 -17
  173. ultralytics/solutions/analytics.py +15 -16
  174. ultralytics/solutions/config.py +5 -6
  175. ultralytics/solutions/distance_calculation.py +10 -13
  176. ultralytics/solutions/heatmap.py +7 -13
  177. ultralytics/solutions/instance_segmentation.py +5 -8
  178. ultralytics/solutions/object_blurrer.py +7 -10
  179. ultralytics/solutions/object_counter.py +12 -19
  180. ultralytics/solutions/object_cropper.py +8 -14
  181. ultralytics/solutions/parking_management.py +33 -31
  182. ultralytics/solutions/queue_management.py +10 -12
  183. ultralytics/solutions/region_counter.py +9 -12
  184. ultralytics/solutions/security_alarm.py +15 -20
  185. ultralytics/solutions/similarity_search.py +13 -17
  186. ultralytics/solutions/solutions.py +75 -74
  187. ultralytics/solutions/speed_estimation.py +7 -10
  188. ultralytics/solutions/streamlit_inference.py +4 -7
  189. ultralytics/solutions/templates/similarity-search.html +7 -18
  190. ultralytics/solutions/trackzone.py +7 -10
  191. ultralytics/solutions/vision_eye.py +5 -8
  192. ultralytics/trackers/__init__.py +1 -1
  193. ultralytics/trackers/basetrack.py +3 -5
  194. ultralytics/trackers/bot_sort.py +10 -27
  195. ultralytics/trackers/byte_tracker.py +14 -30
  196. ultralytics/trackers/track.py +3 -6
  197. ultralytics/trackers/utils/gmc.py +11 -22
  198. ultralytics/trackers/utils/kalman_filter.py +37 -48
  199. ultralytics/trackers/utils/matching.py +12 -15
  200. ultralytics/utils/__init__.py +116 -116
  201. ultralytics/utils/autobatch.py +2 -4
  202. ultralytics/utils/autodevice.py +17 -18
  203. ultralytics/utils/benchmarks.py +70 -70
  204. ultralytics/utils/callbacks/base.py +8 -10
  205. ultralytics/utils/callbacks/clearml.py +5 -13
  206. ultralytics/utils/callbacks/comet.py +32 -46
  207. ultralytics/utils/callbacks/dvc.py +13 -18
  208. ultralytics/utils/callbacks/mlflow.py +4 -5
  209. ultralytics/utils/callbacks/neptune.py +7 -15
  210. ultralytics/utils/callbacks/platform.py +314 -38
  211. ultralytics/utils/callbacks/raytune.py +3 -4
  212. ultralytics/utils/callbacks/tensorboard.py +23 -31
  213. ultralytics/utils/callbacks/wb.py +10 -13
  214. ultralytics/utils/checks.py +151 -87
  215. ultralytics/utils/cpu.py +3 -8
  216. ultralytics/utils/dist.py +19 -15
  217. ultralytics/utils/downloads.py +29 -41
  218. ultralytics/utils/errors.py +6 -14
  219. ultralytics/utils/events.py +2 -4
  220. ultralytics/utils/export/__init__.py +7 -0
  221. ultralytics/utils/{export.py → export/engine.py} +16 -16
  222. ultralytics/utils/export/imx.py +325 -0
  223. ultralytics/utils/export/tensorflow.py +231 -0
  224. ultralytics/utils/files.py +24 -28
  225. ultralytics/utils/git.py +9 -11
  226. ultralytics/utils/instance.py +30 -51
  227. ultralytics/utils/logger.py +212 -114
  228. ultralytics/utils/loss.py +15 -24
  229. ultralytics/utils/metrics.py +131 -160
  230. ultralytics/utils/nms.py +21 -30
  231. ultralytics/utils/ops.py +107 -165
  232. ultralytics/utils/patches.py +33 -21
  233. ultralytics/utils/plotting.py +122 -119
  234. ultralytics/utils/tal.py +28 -44
  235. ultralytics/utils/torch_utils.py +70 -187
  236. ultralytics/utils/tqdm.py +20 -20
  237. ultralytics/utils/triton.py +13 -19
  238. ultralytics/utils/tuner.py +17 -5
  239. dgenerate_ultralytics_headless-8.3.196.dist-info/RECORD +0 -281
  240. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/WHEEL +0 -0
  241. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/entry_points.txt +0 -0
  242. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/licenses/LICENSE +0 -0
  243. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/transformer.py
@@ -10,26 +10,27 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import constant_, xavier_uniform_
 
+from ultralytics.utils.torch_utils import TORCH_1_11
+
 from .conv import Conv
 from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
 
 __all__ = (
-    "TransformerEncoderLayer",
-    "TransformerLayer",
-    "TransformerBlock",
-    "MLPBlock",
-    "LayerNorm2d",
     "AIFI",
+    "MLP",
     "DeformableTransformerDecoder",
     "DeformableTransformerDecoderLayer",
+    "LayerNorm2d",
+    "MLPBlock",
     "MSDeformAttn",
-    "MLP",
+    "TransformerBlock",
+    "TransformerEncoderLayer",
+    "TransformerLayer",
 )
 
 
 class TransformerEncoderLayer(nn.Module):
-    """
-    A single layer of the transformer encoder.
+    """A single layer of the transformer encoder.
 
     This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
     supporting both pre-normalization and post-normalization configurations.
@@ -56,8 +57,7 @@ class TransformerEncoderLayer(nn.Module):
         act: nn.Module = nn.GELU(),
         normalize_before: bool = False,
     ):
-        """
-        Initialize the TransformerEncoderLayer with specified parameters.
+        """Initialize the TransformerEncoderLayer with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -100,8 +100,7 @@
         src_key_padding_mask: torch.Tensor | None = None,
         pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform forward pass with post-normalization.
+        """Perform forward pass with post-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -127,8 +126,7 @@
         src_key_padding_mask: torch.Tensor | None = None,
         pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform forward pass with pre-normalization.
+        """Perform forward pass with pre-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -154,8 +152,7 @@
         src_key_padding_mask: torch.Tensor | None = None,
         pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Forward propagate the input through the encoder module.
+        """Forward propagate the input through the encoder module.
 
         Args:
             src (torch.Tensor): Input tensor.
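Note: the docstring cleanups above cover forward_post, forward_pre, and the dispatching forward of TransformerEncoderLayer. For orientation, a toy sketch of the pre-/post-normalization split those docstrings refer to (an illustration only, not the Ultralytics layer itself, which uses multi-head attention and a two-layer feedforward):

```python
import torch
import torch.nn as nn


class EncoderLayerSketch(nn.Module):
    """Toy layer showing the pre-/post-norm dispatch described above."""

    def __init__(self, c1: int, normalize_before: bool = False):
        super().__init__()
        self.ffn = nn.Linear(c1, c1)  # stand-in for the attention + FFN sublayers
        self.norm = nn.LayerNorm(c1)
        self.normalize_before = normalize_before

    def forward_post(self, src: torch.Tensor) -> torch.Tensor:
        return self.norm(src + self.ffn(src))  # sublayer first, normalize after

    def forward_pre(self, src: torch.Tensor) -> torch.Tensor:
        return src + self.ffn(self.norm(src))  # normalize first, then sublayer

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        return self.forward_pre(src) if self.normalize_before else self.forward_post(src)


x = torch.randn(2, 16, 32)
print(EncoderLayerSketch(32, normalize_before=True)(x).shape)  # torch.Size([2, 16, 32])
```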
@@ -172,8 +169,7 @@
 
 
 class AIFI(TransformerEncoderLayer):
-    """
-    AIFI transformer layer for 2D data with positional embeddings.
+    """AIFI transformer layer for 2D data with positional embeddings.
 
     This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
     embeddings and handling the spatial dimensions appropriately.
@@ -188,8 +184,7 @@
         act: nn.Module = nn.GELU(),
         normalize_before: bool = False,
     ):
-        """
-        Initialize the AIFI instance with specified parameters.
+        """Initialize the AIFI instance with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -202,8 +197,7 @@
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the AIFI transformer layer.
+        """Forward pass for the AIFI transformer layer.
 
         Args:
             x (torch.Tensor): Input tensor with shape [B, C, H, W].
@@ -221,8 +215,7 @@
     def build_2d_sincos_position_embedding(
         w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
     ) -> torch.Tensor:
-        """
-        Build 2D sine-cosine position embedding.
+        """Build 2D sine-cosine position embedding.
 
         Args:
             w (int): Width of the feature map.
@@ -236,7 +229,7 @@
         assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         grid_w = torch.arange(w, dtype=torch.float32)
         grid_h = torch.arange(h, dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") if TORCH_1_11 else torch.meshgrid(grid_w, grid_h)
         pos_dim = embed_dim // 4
         omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
         omega = 1.0 / (temperature**omega)
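The meshgrid change above guards the `indexing` keyword behind `TORCH_1_11`: older PyTorch releases do not accept the argument (it landed around PyTorch 1.10), and their legacy default already matches "ij" indexing, so both branches build the same grids. A quick equivalence check on a recent PyTorch:

```python
import torch

w, h = 4, 3
grid_w = torch.arange(w, dtype=torch.float32)
grid_h = torch.arange(h, dtype=torch.float32)

# On PyTorch >= 1.10 both calls produce identical "ij"-indexed grids; the
# keyword-less call is the fallback path that old versions take anyway.
a = torch.meshgrid(grid_w, grid_h, indexing="ij")
b = torch.meshgrid(grid_w, grid_h)  # legacy default; warns on new torch
assert all(torch.equal(x, y) for x, y in zip(a, b))
```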
@@ -251,8 +244,7 @@
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
     def __init__(self, c: int, num_heads: int):
-        """
-        Initialize a self-attention mechanism using linear transformations and multi-head attention.
+        """Initialize a self-attention mechanism using linear transformations and multi-head attention.
 
         Args:
             c (int): Input and output channel dimension.
@@ -267,8 +259,7 @@
         self.fc2 = nn.Linear(c, c, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Apply a transformer block to the input x and return the output.
+        """Apply a transformer block to the input x and return the output.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -281,11 +272,10 @@
 
 
 class TransformerBlock(nn.Module):
-    """
-    Vision Transformer block based on https://arxiv.org/abs/2010.11929.
+    """Vision Transformer block based on https://arxiv.org/abs/2010.11929.
 
-    This class implements a complete transformer block with optional convolution layer for channel adjustment,
-    learnable position embedding, and multiple transformer layers.
+    This class implements a complete transformer block with optional convolution layer for channel adjustment, learnable
+    position embedding, and multiple transformer layers.
 
     Attributes:
         conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -295,8 +285,7 @@
     """
 
    def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
-        """
-        Initialize a Transformer module with position embedding and specified number of heads and layers.
+        """Initialize a Transformer module with position embedding and specified number of heads and layers.
 
         Args:
             c1 (int): Input channel dimension.
@@ -313,28 +302,26 @@
         self.c2 = c2
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward propagate the input through the transformer block.
+        """Forward propagate the input through the transformer block.
 
         Args:
-            x (torch.Tensor): Input tensor with shape [b, c1, w, h].
+            x (torch.Tensor): Input tensor with shape [b, c1, h, w].
 
         Returns:
-            (torch.Tensor): Output tensor with shape [b, c2, w, h].
+            (torch.Tensor): Output tensor with shape [b, c2, h, w].
         """
         if self.conv is not None:
             x = self.conv(x)
-        b, _, w, h = x.shape
+        b, _, h, w = x.shape
         p = x.flatten(2).permute(2, 0, 1)
-        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
+        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, h, w)
 
 
 class MLPBlock(nn.Module):
     """A single block of a multi-layer perceptron."""
 
     def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
-        """
-        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
+        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
 
         Args:
             embedding_dim (int): Input and output dimension.
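The `b, _, w, h = x.shape` fix above corrects a naming mix-up: PyTorch feature maps are NCHW, so dims 2 and 3 are height then width. Because the old code reshaped back with the same swapped names, the output tensor was still correct; only the variable names and docstring shapes were wrong for non-square inputs. The corrected round trip in isolation:

```python
import torch

x = torch.randn(2, 8, 20, 30)  # NCHW: batch=2, channels=8, height=20, width=30
b, c, h, w = x.shape
assert (h, w) == (20, 30)

# flatten(2) lays out the h*w positions, permute gives (hw, b, c) token order,
# and the inverse reshape must use (b, c, h, w) to restore the map losslessly.
p = x.flatten(2).permute(2, 0, 1)           # (600, 2, 8)
restored = p.permute(1, 2, 0).reshape(b, c, h, w)
assert torch.equal(restored, x)
```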
@@ -347,8 +334,7 @@
         self.act = act()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the MLPBlock.
+        """Forward pass for the MLPBlock.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -360,11 +346,10 @@
 
 
 class MLP(nn.Module):
-    """
-    A simple multi-layer perceptron (also called FFN).
+    """A simple multi-layer perceptron (also called FFN).
 
-    This class implements a configurable MLP with multiple linear layers, activation functions, and optional
-    sigmoid output activation.
+    This class implements a configurable MLP with multiple linear layers, activation functions, and optional sigmoid
+    output activation.
 
     Attributes:
         num_layers (int): Number of layers in the MLP.
@@ -374,10 +359,17 @@
     """
 
     def __init__(
-        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act=nn.ReLU,
+        sigmoid: bool = False,
+        residual: bool = False,
+        out_norm: nn.Module = None,
     ):
-        """
-        Initialize the MLP with specified input, hidden, output dimensions and number of layers.
+        """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
         Args:
             input_dim (int): Input dimension.
@@ -386,17 +378,24 @@
             num_layers (int): Number of layers.
             act (nn.Module): Activation function.
             sigmoid (bool): Whether to apply sigmoid to the output.
+            residual (bool): Whether to use residual connections.
+            out_norm (nn.Module, optional): Normalization layer for the output.
         """
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
         self.sigmoid = sigmoid
         self.act = act()
+        if residual and input_dim != output_dim:
+            raise ValueError("residual is only supported if input_dim == output_dim")
+        self.residual = residual
+        # whether to apply a normalization layer to the output
+        assert isinstance(out_norm, nn.Module) or out_norm is None
+        self.out_norm = out_norm or nn.Identity()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the entire MLP.
+        """Forward pass for the entire MLP.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -404,17 +403,20 @@
         Returns:
             (torch.Tensor): Output tensor after MLP.
         """
+        orig_x = x
         for i, layer in enumerate(self.layers):
             x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+        if getattr(self, "residual", False):
+            x = x + orig_x
+        x = getattr(self, "out_norm", nn.Identity())(x)
         return x.sigmoid() if getattr(self, "sigmoid", False) else x
 
 
 class LayerNorm2d(nn.Module):
-    """
-    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+    """2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
 
-    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
-    while preserving spatial dimensions.
+    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension while
+    preserving spatial dimensions.
 
     Attributes:
         weight (nn.Parameter): Learnable scale parameter.
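The hunks above add `residual` and `out_norm` to MLP without changing default behavior: the residual sum and output normalization run before the optional sigmoid, and `residual=True` demands matching input/output widths. A usage sketch against the 8.3.248 signature shown above (shapes are illustrative):

```python
import torch
import torch.nn as nn
from ultralytics.nn.modules.transformer import MLP

x = torch.randn(2, 100, 256)

# Default behavior is unchanged: stacked Linear layers with the activation between.
plain = MLP(input_dim=256, hidden_dim=512, output_dim=4, num_layers=3)
print(plain(x).shape)  # torch.Size([2, 100, 4])

# residual=True adds the input back (so input_dim must equal output_dim), and
# out_norm (here LayerNorm) is applied before the optional sigmoid.
res = MLP(256, 512, 256, 3, residual=True, out_norm=nn.LayerNorm(256))
print(res(x).shape)  # torch.Size([2, 100, 256])
```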
@@ -427,8 +429,7 @@
     """
 
     def __init__(self, num_channels: int, eps: float = 1e-6):
-        """
-        Initialize LayerNorm2d with the given parameters.
+        """Initialize LayerNorm2d with the given parameters.
 
         Args:
             num_channels (int): Number of channels in the input.
@@ -440,8 +441,7 @@
         self.eps = eps
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Perform forward pass for 2D layer normalization.
+        """Perform forward pass for 2D layer normalization.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -456,11 +456,10 @@
 
 
 class MSDeformAttn(nn.Module):
-    """
-    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
+    """Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
-    This module implements multiscale deformable attention that can attend to features at multiple scales
-    with learnable sampling locations and attention weights.
+    This module implements multiscale deformable attention that can attend to features at multiple scales with learnable
+    sampling locations and attention weights.
 
     Attributes:
         im2col_step (int): Step size for im2col operations.
@@ -478,8 +477,7 @@
     """
 
     def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
-        """
-        Initialize MSDeformAttn with the given parameters.
+        """Initialize MSDeformAttn with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -537,13 +535,12 @@
         value_shapes: list,
         value_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform forward pass for multiscale deformable attention.
+        """Perform forward pass for multiscale deformable attention.
 
         Args:
             query (torch.Tensor): Query tensor with shape [bs, query_length, C].
-            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
-                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
+            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2], range in [0,
+                1], top-left (0,0), bottom-right (1, 1), including padding area.
             value (torch.Tensor): Value tensor with shape [bs, value_length, C].
             value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
             value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
@@ -582,8 +579,7 @@
 
 
 class DeformableTransformerDecoderLayer(nn.Module):
-    """
-    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
+    """Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
 
     This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
     attention, and a feedforward network.
@@ -617,8 +613,7 @@
         n_levels: int = 4,
         n_points: int = 4,
     ):
-        """
-        Initialize the DeformableTransformerDecoderLayer with the given parameters.
+        """Initialize the DeformableTransformerDecoderLayer with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -655,8 +650,7 @@
         return tensor if pos is None else tensor + pos
 
     def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
-        """
-        Perform forward pass through the Feed-Forward Network part of the layer.
+        """Perform forward pass through the Feed-Forward Network part of the layer.
 
         Args:
             tgt (torch.Tensor): Input tensor.
@@ -678,8 +672,7 @@
         attn_mask: torch.Tensor | None = None,
         query_pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform the forward pass through the entire decoder layer.
+        """Perform the forward pass through the entire decoder layer.
 
         Args:
             embed (torch.Tensor): Input embeddings.
@@ -713,11 +706,10 @@
 
 
 class DeformableTransformerDecoder(nn.Module):
-    """
-    Deformable Transformer Decoder based on PaddleDetection implementation.
+    """Deformable Transformer Decoder based on PaddleDetection implementation.
 
-    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
-    heads for bounding box regression and classification.
+    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction heads
+    for bounding box regression and classification.
 
     Attributes:
         layers (nn.ModuleList): List of decoder layers.
@@ -730,8 +722,7 @@
     """
 
    def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
-        """
-        Initialize the DeformableTransformerDecoder with the given parameters.
+        """Initialize the DeformableTransformerDecoder with the given parameters.
 
         Args:
             hidden_dim (int): Hidden dimension.
@@ -757,8 +748,7 @@
         attn_mask: torch.Tensor | None = None,
         padding_mask: torch.Tensor | None = None,
     ):
-        """
-        Perform the forward pass through the entire decoder.
+        """Perform the forward pass through the entire decoder.
 
         Args:
             embed (torch.Tensor): Decoder embeddings.
ultralytics/nn/modules/utils.py
@@ -9,12 +9,11 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import uniform_
 
-__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
+__all__ = "inverse_sigmoid", "multi_scale_deformable_attn_pytorch"
 
 
 def _get_clones(module, n):
-    """
-    Create a list of cloned modules from the given module.
+    """Create a list of cloned modules from the given module.
 
     Args:
         module (nn.Module): The module to be cloned.
@@ -34,12 +33,11 @@
 
 
 def bias_init_with_prob(prior_prob=0.01):
-    """
-    Initialize conv/fc bias value according to a given probability value.
+    """Initialize conv/fc bias value according to a given probability value.
 
-    This function calculates the bias initialization value based on a prior probability using the inverse error function.
-    It's commonly used in object detection models to initialize classification layers with a specific positive prediction
-    probability.
+    This function calculates the bias initialization value based on a prior probability using the inverse error
+    function. It's commonly used in object detection models to initialize classification layers with a specific positive
+    prediction probability.
 
     Args:
         prior_prob (float, optional): Prior probability for bias initialization.
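For context on `bias_init_with_prob` (only its docstring is reflowed here): the usual focal-loss-style initialization chooses the bias so that the initial sigmoid output equals the prior, i.e. b = -log((1 - p) / p). A hedged sketch of that arithmetic (not a copy of the upstream function):

```python
import math

def bias_init_with_prob_sketch(prior_prob: float = 0.01) -> float:
    # Solve sigmoid(b) = p for b: b = -log((1 - p) / p).
    return float(-math.log((1 - prior_prob) / prior_prob))

b = bias_init_with_prob_sketch(0.01)
print(round(b, 4))             # -4.5951
print(1 / (1 + math.exp(-b)))  # ~0.01, the requested prior probability
```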
@@ -56,11 +54,10 @@
 
 
 def linear_init(module):
-    """
-    Initialize the weights and biases of a linear module.
+    """Initialize the weights and biases of a linear module.
 
-    This function initializes the weights of a linear module using a uniform distribution within bounds calculated
-    from the input dimension. If the module has a bias, it is also initialized.
+    This function initializes the weights of a linear module using a uniform distribution within bounds calculated from
+    the input dimension. If the module has a bias, it is also initialized.
 
     Args:
         module (nn.Module): Linear module to initialize.
@@ -80,8 +77,7 @@
 
 
 def inverse_sigmoid(x, eps=1e-5):
-    """
-    Calculate the inverse sigmoid function for a tensor.
+    """Calculate the inverse sigmoid function for a tensor.
 
     This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
     operations, particularly in attention mechanisms and coordinate transformations.
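`inverse_sigmoid` is the logit function with clamping for numerical safety. A minimal sketch consistent with the signature above, assuming the conventional eps-clamped form:

```python
import torch

def inverse_sigmoid_sketch(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Clamp into [0, 1], then apply logit(x) = log(x / (1 - x)) with guarded denominators.
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)

p = torch.tensor([0.1, 0.5, 0.9])
print(inverse_sigmoid_sketch(p))                 # ~[-2.1972, 0.0000, 2.1972]
print(torch.sigmoid(inverse_sigmoid_sketch(p)))  # recovers ~[0.1, 0.5, 0.9]
```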
@@ -110,8 +106,7 @@
     sampling_locations: torch.Tensor,
     attention_weights: torch.Tensor,
 ) -> torch.Tensor:
-    """
-    Implement multi-scale deformable attention in PyTorch.
+    """Implement multi-scale deformable attention in PyTorch.
 
     This function performs deformable attention across multiple feature map scales, allowing the model to attend to
     different spatial locations with learned offsets.
@@ -119,10 +114,10 @@
     Args:
         value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
         value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
-        sampling_locations (torch.Tensor): The sampling locations with shape
-            (bs, num_queries, num_heads, num_levels, num_points, 2).
-        attention_weights (torch.Tensor): The attention weights with shape
-            (bs, num_queries, num_heads, num_levels, num_points).
+        sampling_locations (torch.Tensor): The sampling locations with shape (bs, num_queries, num_heads, num_levels,
+            num_points, 2).
+        attention_weights (torch.Tensor): The attention weights with shape (bs, num_queries, num_heads, num_levels,
+            num_points).
 
     Returns:
         (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
@@ -152,7 +147,7 @@
         sampling_value_list.append(sampling_value_l_)
     # (bs, num_queries, num_heads, num_levels, num_points) ->
     # (bs, num_heads, num_queries, num_levels, num_points) ->
-    # (bs, num_heads, 1, num_queries, num_levels*num_points)
+    # (bs*num_heads, 1, num_queries, num_levels*num_points)
     attention_weights = attention_weights.transpose(1, 2).reshape(
         bs * num_heads, 1, num_queries, num_levels * num_points
     )
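The final hunk only corrects a comment: the reshape that follows merges the batch and head dims, so the weights become (bs*num_heads, 1, num_queries, num_levels*num_points), ready to broadcast against the stacked sampled values. The shape arithmetic in isolation:

```python
import torch

bs, num_queries, num_heads, num_levels, num_points = 2, 100, 8, 4, 4
attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points)

# transpose(1, 2) -> (bs, num_heads, num_queries, num_levels, num_points),
# then reshape folds bs*num_heads together, as the corrected comment states.
w = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries, num_levels * num_points)
print(w.shape)  # torch.Size([16, 1, 100, 16])
```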