dgenerate-ultralytics-headless 8.3.214__py3-none-any.whl → 8.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249)
  1. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/METADATA +64 -74
  2. dgenerate_ultralytics_headless-8.4.7.dist-info/RECORD +311 -0
  3. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/WHEEL +1 -1
  4. tests/__init__.py +7 -9
  5. tests/conftest.py +8 -15
  6. tests/test_cli.py +1 -1
  7. tests/test_cuda.py +13 -10
  8. tests/test_engine.py +9 -9
  9. tests/test_exports.py +65 -13
  10. tests/test_integrations.py +13 -13
  11. tests/test_python.py +125 -69
  12. tests/test_solutions.py +161 -152
  13. ultralytics/__init__.py +1 -1
  14. ultralytics/cfg/__init__.py +86 -92
  15. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  16. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  17. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  18. ultralytics/cfg/datasets/ImageNet.yaml +1 -1
  19. ultralytics/cfg/datasets/TT100K.yaml +346 -0
  20. ultralytics/cfg/datasets/VOC.yaml +15 -16
  21. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  22. ultralytics/cfg/datasets/coco-pose.yaml +21 -0
  23. ultralytics/cfg/datasets/coco12-formats.yaml +101 -0
  24. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  25. ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
  26. ultralytics/cfg/datasets/dog-pose.yaml +28 -0
  27. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  28. ultralytics/cfg/datasets/dota8.yaml +2 -2
  29. ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
  30. ultralytics/cfg/datasets/kitti.yaml +27 -0
  31. ultralytics/cfg/datasets/lvis.yaml +5 -5
  32. ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
  33. ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
  34. ultralytics/cfg/datasets/xView.yaml +16 -16
  35. ultralytics/cfg/default.yaml +4 -2
  36. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  37. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  38. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  39. ultralytics/cfg/models/26/yolo26-cls.yaml +33 -0
  40. ultralytics/cfg/models/26/yolo26-obb.yaml +52 -0
  41. ultralytics/cfg/models/26/yolo26-p2.yaml +60 -0
  42. ultralytics/cfg/models/26/yolo26-p6.yaml +62 -0
  43. ultralytics/cfg/models/26/yolo26-pose.yaml +53 -0
  44. ultralytics/cfg/models/26/yolo26-seg.yaml +52 -0
  45. ultralytics/cfg/models/26/yolo26.yaml +52 -0
  46. ultralytics/cfg/models/26/yoloe-26-seg.yaml +53 -0
  47. ultralytics/cfg/models/26/yoloe-26.yaml +53 -0
  48. ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
  49. ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
  50. ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
  51. ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
  52. ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
  53. ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
  54. ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
  55. ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
  56. ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
  57. ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
  58. ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
  59. ultralytics/cfg/models/v6/yolov6.yaml +1 -1
  60. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  61. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  62. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  63. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  64. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  65. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  66. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  67. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  68. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  69. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  70. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  71. ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
  72. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  73. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  74. ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
  75. ultralytics/data/__init__.py +4 -4
  76. ultralytics/data/annotator.py +5 -6
  77. ultralytics/data/augment.py +300 -475
  78. ultralytics/data/base.py +18 -26
  79. ultralytics/data/build.py +147 -25
  80. ultralytics/data/converter.py +108 -87
  81. ultralytics/data/dataset.py +47 -75
  82. ultralytics/data/loaders.py +42 -49
  83. ultralytics/data/split.py +5 -6
  84. ultralytics/data/split_dota.py +8 -15
  85. ultralytics/data/utils.py +36 -45
  86. ultralytics/engine/exporter.py +351 -263
  87. ultralytics/engine/model.py +186 -225
  88. ultralytics/engine/predictor.py +45 -54
  89. ultralytics/engine/results.py +198 -325
  90. ultralytics/engine/trainer.py +165 -106
  91. ultralytics/engine/tuner.py +41 -43
  92. ultralytics/engine/validator.py +55 -38
  93. ultralytics/hub/__init__.py +16 -19
  94. ultralytics/hub/auth.py +6 -12
  95. ultralytics/hub/google/__init__.py +7 -10
  96. ultralytics/hub/session.py +15 -25
  97. ultralytics/hub/utils.py +5 -8
  98. ultralytics/models/__init__.py +1 -1
  99. ultralytics/models/fastsam/__init__.py +1 -1
  100. ultralytics/models/fastsam/model.py +8 -10
  101. ultralytics/models/fastsam/predict.py +18 -30
  102. ultralytics/models/fastsam/utils.py +1 -2
  103. ultralytics/models/fastsam/val.py +5 -7
  104. ultralytics/models/nas/__init__.py +1 -1
  105. ultralytics/models/nas/model.py +5 -8
  106. ultralytics/models/nas/predict.py +7 -9
  107. ultralytics/models/nas/val.py +1 -2
  108. ultralytics/models/rtdetr/__init__.py +1 -1
  109. ultralytics/models/rtdetr/model.py +5 -8
  110. ultralytics/models/rtdetr/predict.py +15 -19
  111. ultralytics/models/rtdetr/train.py +10 -13
  112. ultralytics/models/rtdetr/val.py +21 -23
  113. ultralytics/models/sam/__init__.py +15 -2
  114. ultralytics/models/sam/amg.py +14 -20
  115. ultralytics/models/sam/build.py +26 -19
  116. ultralytics/models/sam/build_sam3.py +377 -0
  117. ultralytics/models/sam/model.py +29 -32
  118. ultralytics/models/sam/modules/blocks.py +83 -144
  119. ultralytics/models/sam/modules/decoders.py +19 -37
  120. ultralytics/models/sam/modules/encoders.py +44 -101
  121. ultralytics/models/sam/modules/memory_attention.py +16 -30
  122. ultralytics/models/sam/modules/sam.py +200 -73
  123. ultralytics/models/sam/modules/tiny_encoder.py +64 -83
  124. ultralytics/models/sam/modules/transformer.py +18 -28
  125. ultralytics/models/sam/modules/utils.py +174 -50
  126. ultralytics/models/sam/predict.py +2248 -350
  127. ultralytics/models/sam/sam3/__init__.py +3 -0
  128. ultralytics/models/sam/sam3/decoder.py +546 -0
  129. ultralytics/models/sam/sam3/encoder.py +529 -0
  130. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  131. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  132. ultralytics/models/sam/sam3/model_misc.py +199 -0
  133. ultralytics/models/sam/sam3/necks.py +129 -0
  134. ultralytics/models/sam/sam3/sam3_image.py +339 -0
  135. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  136. ultralytics/models/sam/sam3/vitdet.py +547 -0
  137. ultralytics/models/sam/sam3/vl_combiner.py +160 -0
  138. ultralytics/models/utils/loss.py +14 -26
  139. ultralytics/models/utils/ops.py +13 -17
  140. ultralytics/models/yolo/__init__.py +1 -1
  141. ultralytics/models/yolo/classify/predict.py +10 -13
  142. ultralytics/models/yolo/classify/train.py +12 -33
  143. ultralytics/models/yolo/classify/val.py +30 -29
  144. ultralytics/models/yolo/detect/predict.py +9 -12
  145. ultralytics/models/yolo/detect/train.py +17 -23
  146. ultralytics/models/yolo/detect/val.py +77 -59
  147. ultralytics/models/yolo/model.py +43 -60
  148. ultralytics/models/yolo/obb/predict.py +7 -16
  149. ultralytics/models/yolo/obb/train.py +14 -17
  150. ultralytics/models/yolo/obb/val.py +40 -37
  151. ultralytics/models/yolo/pose/__init__.py +1 -1
  152. ultralytics/models/yolo/pose/predict.py +7 -22
  153. ultralytics/models/yolo/pose/train.py +13 -16
  154. ultralytics/models/yolo/pose/val.py +39 -58
  155. ultralytics/models/yolo/segment/predict.py +17 -21
  156. ultralytics/models/yolo/segment/train.py +7 -10
  157. ultralytics/models/yolo/segment/val.py +95 -47
  158. ultralytics/models/yolo/world/train.py +8 -14
  159. ultralytics/models/yolo/world/train_world.py +11 -34
  160. ultralytics/models/yolo/yoloe/__init__.py +7 -7
  161. ultralytics/models/yolo/yoloe/predict.py +16 -23
  162. ultralytics/models/yolo/yoloe/train.py +36 -44
  163. ultralytics/models/yolo/yoloe/train_seg.py +11 -11
  164. ultralytics/models/yolo/yoloe/val.py +15 -20
  165. ultralytics/nn/__init__.py +7 -7
  166. ultralytics/nn/autobackend.py +159 -85
  167. ultralytics/nn/modules/__init__.py +68 -60
  168. ultralytics/nn/modules/activation.py +4 -6
  169. ultralytics/nn/modules/block.py +260 -224
  170. ultralytics/nn/modules/conv.py +52 -97
  171. ultralytics/nn/modules/head.py +831 -299
  172. ultralytics/nn/modules/transformer.py +76 -88
  173. ultralytics/nn/modules/utils.py +16 -21
  174. ultralytics/nn/tasks.py +180 -195
  175. ultralytics/nn/text_model.py +45 -69
  176. ultralytics/optim/__init__.py +5 -0
  177. ultralytics/optim/muon.py +338 -0
  178. ultralytics/solutions/__init__.py +12 -12
  179. ultralytics/solutions/ai_gym.py +13 -19
  180. ultralytics/solutions/analytics.py +15 -16
  181. ultralytics/solutions/config.py +6 -7
  182. ultralytics/solutions/distance_calculation.py +10 -13
  183. ultralytics/solutions/heatmap.py +8 -14
  184. ultralytics/solutions/instance_segmentation.py +6 -9
  185. ultralytics/solutions/object_blurrer.py +7 -10
  186. ultralytics/solutions/object_counter.py +12 -19
  187. ultralytics/solutions/object_cropper.py +8 -14
  188. ultralytics/solutions/parking_management.py +34 -32
  189. ultralytics/solutions/queue_management.py +10 -12
  190. ultralytics/solutions/region_counter.py +9 -12
  191. ultralytics/solutions/security_alarm.py +15 -20
  192. ultralytics/solutions/similarity_search.py +10 -15
  193. ultralytics/solutions/solutions.py +77 -76
  194. ultralytics/solutions/speed_estimation.py +7 -10
  195. ultralytics/solutions/streamlit_inference.py +2 -4
  196. ultralytics/solutions/templates/similarity-search.html +7 -18
  197. ultralytics/solutions/trackzone.py +7 -10
  198. ultralytics/solutions/vision_eye.py +5 -8
  199. ultralytics/trackers/__init__.py +1 -1
  200. ultralytics/trackers/basetrack.py +3 -5
  201. ultralytics/trackers/bot_sort.py +10 -27
  202. ultralytics/trackers/byte_tracker.py +21 -37
  203. ultralytics/trackers/track.py +4 -7
  204. ultralytics/trackers/utils/gmc.py +11 -22
  205. ultralytics/trackers/utils/kalman_filter.py +37 -48
  206. ultralytics/trackers/utils/matching.py +12 -15
  207. ultralytics/utils/__init__.py +124 -124
  208. ultralytics/utils/autobatch.py +2 -4
  209. ultralytics/utils/autodevice.py +17 -18
  210. ultralytics/utils/benchmarks.py +57 -71
  211. ultralytics/utils/callbacks/base.py +8 -10
  212. ultralytics/utils/callbacks/clearml.py +5 -13
  213. ultralytics/utils/callbacks/comet.py +32 -46
  214. ultralytics/utils/callbacks/dvc.py +13 -18
  215. ultralytics/utils/callbacks/mlflow.py +4 -5
  216. ultralytics/utils/callbacks/neptune.py +7 -15
  217. ultralytics/utils/callbacks/platform.py +423 -38
  218. ultralytics/utils/callbacks/raytune.py +3 -4
  219. ultralytics/utils/callbacks/tensorboard.py +25 -31
  220. ultralytics/utils/callbacks/wb.py +16 -14
  221. ultralytics/utils/checks.py +127 -85
  222. ultralytics/utils/cpu.py +3 -8
  223. ultralytics/utils/dist.py +9 -12
  224. ultralytics/utils/downloads.py +25 -33
  225. ultralytics/utils/errors.py +6 -14
  226. ultralytics/utils/events.py +2 -4
  227. ultralytics/utils/export/__init__.py +4 -236
  228. ultralytics/utils/export/engine.py +246 -0
  229. ultralytics/utils/export/imx.py +117 -63
  230. ultralytics/utils/export/tensorflow.py +231 -0
  231. ultralytics/utils/files.py +26 -30
  232. ultralytics/utils/git.py +9 -11
  233. ultralytics/utils/instance.py +30 -51
  234. ultralytics/utils/logger.py +212 -114
  235. ultralytics/utils/loss.py +601 -215
  236. ultralytics/utils/metrics.py +128 -156
  237. ultralytics/utils/nms.py +13 -16
  238. ultralytics/utils/ops.py +117 -166
  239. ultralytics/utils/patches.py +75 -21
  240. ultralytics/utils/plotting.py +75 -80
  241. ultralytics/utils/tal.py +125 -59
  242. ultralytics/utils/torch_utils.py +53 -79
  243. ultralytics/utils/tqdm.py +24 -21
  244. ultralytics/utils/triton.py +13 -19
  245. ultralytics/utils/tuner.py +19 -10
  246. dgenerate_ultralytics_headless-8.3.214.dist-info/RECORD +0 -283
  247. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/entry_points.txt +0 -0
  248. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/licenses/LICENSE +0 -0
  249. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/top_level.txt +0 -0
@@ -16,22 +16,21 @@ from .conv import Conv
16
16
  from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
17
17
 
18
18
  __all__ = (
19
- "TransformerEncoderLayer",
20
- "TransformerLayer",
21
- "TransformerBlock",
22
- "MLPBlock",
23
- "LayerNorm2d",
24
19
  "AIFI",
20
+ "MLP",
25
21
  "DeformableTransformerDecoder",
26
22
  "DeformableTransformerDecoderLayer",
23
+ "LayerNorm2d",
24
+ "MLPBlock",
27
25
  "MSDeformAttn",
28
- "MLP",
26
+ "TransformerBlock",
27
+ "TransformerEncoderLayer",
28
+ "TransformerLayer",
29
29
  )
30
30
 
31
31
 
32
32
  class TransformerEncoderLayer(nn.Module):
33
- """
34
- A single layer of the transformer encoder.
33
+ """A single layer of the transformer encoder.
35
34
 
36
35
  This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
37
36
  supporting both pre-normalization and post-normalization configurations.
@@ -58,8 +57,7 @@ class TransformerEncoderLayer(nn.Module):
58
57
  act: nn.Module = nn.GELU(),
59
58
  normalize_before: bool = False,
60
59
  ):
61
- """
62
- Initialize the TransformerEncoderLayer with specified parameters.
60
+ """Initialize the TransformerEncoderLayer with specified parameters.
63
61
 
64
62
  Args:
65
63
  c1 (int): Input dimension.
@@ -102,8 +100,7 @@ class TransformerEncoderLayer(nn.Module):
102
100
  src_key_padding_mask: torch.Tensor | None = None,
103
101
  pos: torch.Tensor | None = None,
104
102
  ) -> torch.Tensor:
105
- """
106
- Perform forward pass with post-normalization.
103
+ """Perform forward pass with post-normalization.
107
104
 
108
105
  Args:
109
106
  src (torch.Tensor): Input tensor.
@@ -129,8 +126,7 @@ class TransformerEncoderLayer(nn.Module):
129
126
  src_key_padding_mask: torch.Tensor | None = None,
130
127
  pos: torch.Tensor | None = None,
131
128
  ) -> torch.Tensor:
132
- """
133
- Perform forward pass with pre-normalization.
129
+ """Perform forward pass with pre-normalization.
134
130
 
135
131
  Args:
136
132
  src (torch.Tensor): Input tensor.
@@ -156,8 +152,7 @@ class TransformerEncoderLayer(nn.Module):
156
152
  src_key_padding_mask: torch.Tensor | None = None,
157
153
  pos: torch.Tensor | None = None,
158
154
  ) -> torch.Tensor:
159
- """
160
- Forward propagate the input through the encoder module.
155
+ """Forward propagate the input through the encoder module.
161
156
 
162
157
  Args:
163
158
  src (torch.Tensor): Input tensor.
@@ -174,8 +169,7 @@ class TransformerEncoderLayer(nn.Module):
174
169
 
175
170
 
176
171
  class AIFI(TransformerEncoderLayer):
177
- """
178
- AIFI transformer layer for 2D data with positional embeddings.
172
+ """AIFI transformer layer for 2D data with positional embeddings.
179
173
 
180
174
  This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
181
175
  embeddings and handling the spatial dimensions appropriately.
@@ -190,8 +184,7 @@ class AIFI(TransformerEncoderLayer):
190
184
  act: nn.Module = nn.GELU(),
191
185
  normalize_before: bool = False,
192
186
  ):
193
- """
194
- Initialize the AIFI instance with specified parameters.
187
+ """Initialize the AIFI instance with specified parameters.
195
188
 
196
189
  Args:
197
190
  c1 (int): Input dimension.
@@ -204,8 +197,7 @@ class AIFI(TransformerEncoderLayer):
204
197
  super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
205
198
 
206
199
  def forward(self, x: torch.Tensor) -> torch.Tensor:
207
- """
208
- Forward pass for the AIFI transformer layer.
200
+ """Forward pass for the AIFI transformer layer.
209
201
 
210
202
  Args:
211
203
  x (torch.Tensor): Input tensor with shape [B, C, H, W].
@@ -223,8 +215,7 @@ class AIFI(TransformerEncoderLayer):
223
215
  def build_2d_sincos_position_embedding(
224
216
  w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
225
217
  ) -> torch.Tensor:
226
- """
227
- Build 2D sine-cosine position embedding.
218
+ """Build 2D sine-cosine position embedding.
228
219
 
229
220
  Args:
230
221
  w (int): Width of the feature map.
@@ -253,8 +244,7 @@ class TransformerLayer(nn.Module):
253
244
  """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
254
245
 
255
246
  def __init__(self, c: int, num_heads: int):
256
- """
257
- Initialize a self-attention mechanism using linear transformations and multi-head attention.
247
+ """Initialize a self-attention mechanism using linear transformations and multi-head attention.
258
248
 
259
249
  Args:
260
250
  c (int): Input and output channel dimension.
@@ -269,8 +259,7 @@ class TransformerLayer(nn.Module):
269
259
  self.fc2 = nn.Linear(c, c, bias=False)
270
260
 
271
261
  def forward(self, x: torch.Tensor) -> torch.Tensor:
272
- """
273
- Apply a transformer block to the input x and return the output.
262
+ """Apply a transformer block to the input x and return the output.
274
263
 
275
264
  Args:
276
265
  x (torch.Tensor): Input tensor.
@@ -283,11 +272,10 @@ class TransformerLayer(nn.Module):
283
272
 
284
273
 
285
274
  class TransformerBlock(nn.Module):
286
- """
287
- Vision Transformer block based on https://arxiv.org/abs/2010.11929.
275
+ """Vision Transformer block based on https://arxiv.org/abs/2010.11929.
288
276
 
289
- This class implements a complete transformer block with optional convolution layer for channel adjustment,
290
- learnable position embedding, and multiple transformer layers.
277
+ This class implements a complete transformer block with optional convolution layer for channel adjustment, learnable
278
+ position embedding, and multiple transformer layers.
291
279
 
292
280
  Attributes:
293
281
  conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -297,8 +285,7 @@ class TransformerBlock(nn.Module):
297
285
  """
298
286
 
299
287
  def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
300
- """
301
- Initialize a Transformer module with position embedding and specified number of heads and layers.
288
+ """Initialize a Transformer module with position embedding and specified number of heads and layers.
302
289
 
303
290
  Args:
304
291
  c1 (int): Input channel dimension.
@@ -315,28 +302,26 @@ class TransformerBlock(nn.Module):
315
302
  self.c2 = c2
316
303
 
317
304
  def forward(self, x: torch.Tensor) -> torch.Tensor:
318
- """
319
- Forward propagate the input through the transformer block.
305
+ """Forward propagate the input through the transformer block.
320
306
 
321
307
  Args:
322
- x (torch.Tensor): Input tensor with shape [b, c1, w, h].
308
+ x (torch.Tensor): Input tensor with shape [b, c1, h, w].
323
309
 
324
310
  Returns:
325
- (torch.Tensor): Output tensor with shape [b, c2, w, h].
311
+ (torch.Tensor): Output tensor with shape [b, c2, h, w].
326
312
  """
327
313
  if self.conv is not None:
328
314
  x = self.conv(x)
329
- b, _, w, h = x.shape
315
+ b, _, h, w = x.shape
330
316
  p = x.flatten(2).permute(2, 0, 1)
331
- return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
317
+ return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, h, w)
332
318
 
333
319
 
334
320
  class MLPBlock(nn.Module):
335
321
  """A single block of a multi-layer perceptron."""
336
322
 
337
323
  def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
338
- """
339
- Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
324
+ """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
340
325
 
341
326
  Args:
342
327
  embedding_dim (int): Input and output dimension.
@@ -349,8 +334,7 @@ class MLPBlock(nn.Module):
349
334
  self.act = act()
350
335
 
351
336
  def forward(self, x: torch.Tensor) -> torch.Tensor:
352
- """
353
- Forward pass for the MLPBlock.
337
+ """Forward pass for the MLPBlock.
354
338
 
355
339
  Args:
356
340
  x (torch.Tensor): Input tensor.
@@ -362,11 +346,10 @@ class MLPBlock(nn.Module):
362
346
 
363
347
 
364
348
  class MLP(nn.Module):
365
- """
366
- A simple multi-layer perceptron (also called FFN).
349
+ """A simple multi-layer perceptron (also called FFN).
367
350
 
368
- This class implements a configurable MLP with multiple linear layers, activation functions, and optional
369
- sigmoid output activation.
351
+ This class implements a configurable MLP with multiple linear layers, activation functions, and optional sigmoid
352
+ output activation.
370
353
 
371
354
  Attributes:
372
355
  num_layers (int): Number of layers in the MLP.
@@ -376,10 +359,17 @@ class MLP(nn.Module):
376
359
  """
377
360
 
378
361
  def __init__(
379
- self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
362
+ self,
363
+ input_dim: int,
364
+ hidden_dim: int,
365
+ output_dim: int,
366
+ num_layers: int,
367
+ act=nn.ReLU,
368
+ sigmoid: bool = False,
369
+ residual: bool = False,
370
+ out_norm: nn.Module = None,
380
371
  ):
381
- """
382
- Initialize the MLP with specified input, hidden, output dimensions and number of layers.
372
+ """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
383
373
 
384
374
  Args:
385
375
  input_dim (int): Input dimension.
@@ -388,17 +378,24 @@ class MLP(nn.Module):
388
378
  num_layers (int): Number of layers.
389
379
  act (nn.Module): Activation function.
390
380
  sigmoid (bool): Whether to apply sigmoid to the output.
381
+ residual (bool): Whether to use residual connections.
382
+ out_norm (nn.Module, optional): Normalization layer for the output.
391
383
  """
392
384
  super().__init__()
393
385
  self.num_layers = num_layers
394
386
  h = [hidden_dim] * (num_layers - 1)
395
- self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
387
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
396
388
  self.sigmoid = sigmoid
397
389
  self.act = act()
390
+ if residual and input_dim != output_dim:
391
+ raise ValueError("residual is only supported if input_dim == output_dim")
392
+ self.residual = residual
393
+ # whether to apply a normalization layer to the output
394
+ assert isinstance(out_norm, nn.Module) or out_norm is None
395
+ self.out_norm = out_norm or nn.Identity()
398
396
 
399
397
  def forward(self, x: torch.Tensor) -> torch.Tensor:
400
- """
401
- Forward pass for the entire MLP.
398
+ """Forward pass for the entire MLP.
402
399
 
403
400
  Args:
404
401
  x (torch.Tensor): Input tensor.
@@ -406,17 +403,20 @@ class MLP(nn.Module):
406
403
  Returns:
407
404
  (torch.Tensor): Output tensor after MLP.
408
405
  """
406
+ orig_x = x
409
407
  for i, layer in enumerate(self.layers):
410
408
  x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
409
+ if getattr(self, "residual", False):
410
+ x = x + orig_x
411
+ x = getattr(self, "out_norm", nn.Identity())(x)
411
412
  return x.sigmoid() if getattr(self, "sigmoid", False) else x
412
413
 
413
414
 
414
415
  class LayerNorm2d(nn.Module):
415
- """
416
- 2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
416
+ """2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
417
417
 
418
- This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
419
- while preserving spatial dimensions.
418
+ This class implements layer normalization for 2D feature maps, normalizing across the channel dimension while
419
+ preserving spatial dimensions.
420
420
 
421
421
  Attributes:
422
422
  weight (nn.Parameter): Learnable scale parameter.
@@ -429,8 +429,7 @@ class LayerNorm2d(nn.Module):
429
429
  """
430
430
 
431
431
  def __init__(self, num_channels: int, eps: float = 1e-6):
432
- """
433
- Initialize LayerNorm2d with the given parameters.
432
+ """Initialize LayerNorm2d with the given parameters.
434
433
 
435
434
  Args:
436
435
  num_channels (int): Number of channels in the input.
@@ -442,8 +441,7 @@ class LayerNorm2d(nn.Module):
442
441
  self.eps = eps
443
442
 
444
443
  def forward(self, x: torch.Tensor) -> torch.Tensor:
445
- """
446
- Perform forward pass for 2D layer normalization.
444
+ """Perform forward pass for 2D layer normalization.
447
445
 
448
446
  Args:
449
447
  x (torch.Tensor): Input tensor.
@@ -458,11 +456,10 @@ class LayerNorm2d(nn.Module):
458
456
 
459
457
 
460
458
  class MSDeformAttn(nn.Module):
461
- """
462
- Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
459
+ """Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
463
460
 
464
- This module implements multiscale deformable attention that can attend to features at multiple scales
465
- with learnable sampling locations and attention weights.
461
+ This module implements multiscale deformable attention that can attend to features at multiple scales with learnable
462
+ sampling locations and attention weights.
466
463
 
467
464
  Attributes:
468
465
  im2col_step (int): Step size for im2col operations.
@@ -480,8 +477,7 @@ class MSDeformAttn(nn.Module):
480
477
  """
481
478
 
482
479
  def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
483
- """
484
- Initialize MSDeformAttn with the given parameters.
480
+ """Initialize MSDeformAttn with the given parameters.
485
481
 
486
482
  Args:
487
483
  d_model (int): Model dimension.
@@ -539,13 +535,12 @@ class MSDeformAttn(nn.Module):
539
535
  value_shapes: list,
540
536
  value_mask: torch.Tensor | None = None,
541
537
  ) -> torch.Tensor:
542
- """
543
- Perform forward pass for multiscale deformable attention.
538
+ """Perform forward pass for multiscale deformable attention.
544
539
 
545
540
  Args:
546
541
  query (torch.Tensor): Query tensor with shape [bs, query_length, C].
547
- refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
548
- range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
542
+ refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2], range in [0,
543
+ 1], top-left (0,0), bottom-right (1, 1), including padding area.
549
544
  value (torch.Tensor): Value tensor with shape [bs, value_length, C].
550
545
  value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
551
546
  value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
@@ -584,8 +579,7 @@ class MSDeformAttn(nn.Module):
584
579
 
585
580
 
586
581
  class DeformableTransformerDecoderLayer(nn.Module):
587
- """
588
- Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
582
+ """Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
589
583
 
590
584
  This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
591
585
  attention, and a feedforward network.
@@ -619,8 +613,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
619
613
  n_levels: int = 4,
620
614
  n_points: int = 4,
621
615
  ):
622
- """
623
- Initialize the DeformableTransformerDecoderLayer with the given parameters.
616
+ """Initialize the DeformableTransformerDecoderLayer with the given parameters.
624
617
 
625
618
  Args:
626
619
  d_model (int): Model dimension.
@@ -657,8 +650,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
657
650
  return tensor if pos is None else tensor + pos
658
651
 
659
652
  def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
660
- """
661
- Perform forward pass through the Feed-Forward Network part of the layer.
653
+ """Perform forward pass through the Feed-Forward Network part of the layer.
662
654
 
663
655
  Args:
664
656
  tgt (torch.Tensor): Input tensor.
@@ -680,8 +672,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
680
672
  attn_mask: torch.Tensor | None = None,
681
673
  query_pos: torch.Tensor | None = None,
682
674
  ) -> torch.Tensor:
683
- """
684
- Perform the forward pass through the entire decoder layer.
675
+ """Perform the forward pass through the entire decoder layer.
685
676
 
686
677
  Args:
687
678
  embed (torch.Tensor): Input embeddings.
@@ -715,11 +706,10 @@ class DeformableTransformerDecoderLayer(nn.Module):
715
706
 
716
707
 
717
708
  class DeformableTransformerDecoder(nn.Module):
718
- """
719
- Deformable Transformer Decoder based on PaddleDetection implementation.
709
+ """Deformable Transformer Decoder based on PaddleDetection implementation.
720
710
 
721
- This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
722
- heads for bounding box regression and classification.
711
+ This class implements a complete deformable transformer decoder with multiple decoder layers and prediction heads
712
+ for bounding box regression and classification.
723
713
 
724
714
  Attributes:
725
715
  layers (nn.ModuleList): List of decoder layers.
@@ -732,8 +722,7 @@ class DeformableTransformerDecoder(nn.Module):
732
722
  """
733
723
 
734
724
  def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
735
- """
736
- Initialize the DeformableTransformerDecoder with the given parameters.
725
+ """Initialize the DeformableTransformerDecoder with the given parameters.
737
726
 
738
727
  Args:
739
728
  hidden_dim (int): Hidden dimension.
@@ -759,8 +748,7 @@ class DeformableTransformerDecoder(nn.Module):
759
748
  attn_mask: torch.Tensor | None = None,
760
749
  padding_mask: torch.Tensor | None = None,
761
750
  ):
762
- """
763
- Perform the forward pass through the entire decoder.
751
+ """Perform the forward pass through the entire decoder.
764
752
 
765
753
  Args:
766
754
  embed (torch.Tensor): Decoder embeddings.
@@ -9,12 +9,11 @@ import torch.nn as nn
9
9
  import torch.nn.functional as F
10
10
  from torch.nn.init import uniform_
11
11
 
12
- __all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
12
+ __all__ = "inverse_sigmoid", "multi_scale_deformable_attn_pytorch"
13
13
 
14
14
 
15
15
  def _get_clones(module, n):
16
- """
17
- Create a list of cloned modules from the given module.
16
+ """Create a list of cloned modules from the given module.
18
17
 
19
18
  Args:
20
19
  module (nn.Module): The module to be cloned.
@@ -34,12 +33,11 @@ def _get_clones(module, n):
34
33
 
35
34
 
36
35
  def bias_init_with_prob(prior_prob=0.01):
37
- """
38
- Initialize conv/fc bias value according to a given probability value.
36
+ """Initialize conv/fc bias value according to a given probability value.
39
37
 
40
- This function calculates the bias initialization value based on a prior probability using the inverse error function.
41
- It's commonly used in object detection models to initialize classification layers with a specific positive prediction
42
- probability.
38
+ This function calculates the bias initialization value based on a prior probability using the inverse error
39
+ function. It's commonly used in object detection models to initialize classification layers with a specific positive
40
+ prediction probability.
43
41
 
44
42
  Args:
45
43
  prior_prob (float, optional): Prior probability for bias initialization.
@@ -56,11 +54,10 @@ def bias_init_with_prob(prior_prob=0.01):
56
54
 
57
55
 
58
56
  def linear_init(module):
59
- """
60
- Initialize the weights and biases of a linear module.
57
+ """Initialize the weights and biases of a linear module.
61
58
 
62
- This function initializes the weights of a linear module using a uniform distribution within bounds calculated
63
- from the input dimension. If the module has a bias, it is also initialized.
59
+ This function initializes the weights of a linear module using a uniform distribution within bounds calculated from
60
+ the input dimension. If the module has a bias, it is also initialized.
64
61
 
65
62
  Args:
66
63
  module (nn.Module): Linear module to initialize.
@@ -80,8 +77,7 @@ def linear_init(module):
80
77
 
81
78
 
82
79
  def inverse_sigmoid(x, eps=1e-5):
83
- """
84
- Calculate the inverse sigmoid function for a tensor.
80
+ """Calculate the inverse sigmoid function for a tensor.
85
81
 
86
82
  This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
87
83
  operations, particularly in attention mechanisms and coordinate transformations.
@@ -110,8 +106,7 @@ def multi_scale_deformable_attn_pytorch(
110
106
  sampling_locations: torch.Tensor,
111
107
  attention_weights: torch.Tensor,
112
108
  ) -> torch.Tensor:
113
- """
114
- Implement multi-scale deformable attention in PyTorch.
109
+ """Implement multi-scale deformable attention in PyTorch.
115
110
 
116
111
  This function performs deformable attention across multiple feature map scales, allowing the model to attend to
117
112
  different spatial locations with learned offsets.
@@ -119,10 +114,10 @@ def multi_scale_deformable_attn_pytorch(
119
114
  Args:
120
115
  value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
121
116
  value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
122
- sampling_locations (torch.Tensor): The sampling locations with shape
123
- (bs, num_queries, num_heads, num_levels, num_points, 2).
124
- attention_weights (torch.Tensor): The attention weights with shape
125
- (bs, num_queries, num_heads, num_levels, num_points).
117
+ sampling_locations (torch.Tensor): The sampling locations with shape (bs, num_queries, num_heads, num_levels,
118
+ num_points, 2).
119
+ attention_weights (torch.Tensor): The attention weights with shape (bs, num_queries, num_heads, num_levels,
120
+ num_points).
126
121
 
127
122
  Returns:
128
123
  (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
@@ -152,7 +147,7 @@ def multi_scale_deformable_attn_pytorch(
152
147
  sampling_value_list.append(sampling_value_l_)
153
148
  # (bs, num_queries, num_heads, num_levels, num_points) ->
154
149
  # (bs, num_heads, num_queries, num_levels, num_points) ->
155
- # (bs, num_heads, 1, num_queries, num_levels*num_points)
150
+ # (bs*num_heads, 1, num_queries, num_levels*num_points)
156
151
  attention_weights = attention_weights.transpose(1, 2).reshape(
157
152
  bs * num_heads, 1, num_queries, num_levels * num_points
158
153
  )