dgenerate_ultralytics_headless-8.3.137-py3-none-any.whl → dgenerate_ultralytics_headless-8.3.224-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (215)
  1. {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/METADATA +41 -34
  2. dgenerate_ultralytics_headless-8.3.224.dist-info/RECORD +285 -0
  3. {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/WHEEL +1 -1
  4. tests/__init__.py +7 -6
  5. tests/conftest.py +15 -39
  6. tests/test_cli.py +17 -17
  7. tests/test_cuda.py +17 -8
  8. tests/test_engine.py +36 -10
  9. tests/test_exports.py +98 -37
  10. tests/test_integrations.py +12 -15
  11. tests/test_python.py +126 -82
  12. tests/test_solutions.py +319 -135
  13. ultralytics/__init__.py +27 -9
  14. ultralytics/cfg/__init__.py +83 -87
  15. ultralytics/cfg/datasets/Argoverse.yaml +4 -4
  16. ultralytics/cfg/datasets/DOTAv1.5.yaml +2 -2
  17. ultralytics/cfg/datasets/DOTAv1.yaml +2 -2
  18. ultralytics/cfg/datasets/GlobalWheat2020.yaml +2 -2
  19. ultralytics/cfg/datasets/HomeObjects-3K.yaml +4 -5
  20. ultralytics/cfg/datasets/ImageNet.yaml +3 -3
  21. ultralytics/cfg/datasets/Objects365.yaml +24 -20
  22. ultralytics/cfg/datasets/SKU-110K.yaml +9 -9
  23. ultralytics/cfg/datasets/VOC.yaml +10 -13
  24. ultralytics/cfg/datasets/VisDrone.yaml +43 -33
  25. ultralytics/cfg/datasets/african-wildlife.yaml +5 -5
  26. ultralytics/cfg/datasets/brain-tumor.yaml +4 -5
  27. ultralytics/cfg/datasets/carparts-seg.yaml +5 -5
  28. ultralytics/cfg/datasets/coco-pose.yaml +26 -4
  29. ultralytics/cfg/datasets/coco.yaml +4 -4
  30. ultralytics/cfg/datasets/coco128-seg.yaml +2 -2
  31. ultralytics/cfg/datasets/coco128.yaml +2 -2
  32. ultralytics/cfg/datasets/coco8-grayscale.yaml +103 -0
  33. ultralytics/cfg/datasets/coco8-multispectral.yaml +2 -2
  34. ultralytics/cfg/datasets/coco8-pose.yaml +23 -2
  35. ultralytics/cfg/datasets/coco8-seg.yaml +2 -2
  36. ultralytics/cfg/datasets/coco8.yaml +2 -2
  37. ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
  38. ultralytics/cfg/datasets/crack-seg.yaml +5 -5
  39. ultralytics/cfg/datasets/dog-pose.yaml +32 -4
  40. ultralytics/cfg/datasets/dota8-multispectral.yaml +2 -2
  41. ultralytics/cfg/datasets/dota8.yaml +2 -2
  42. ultralytics/cfg/datasets/hand-keypoints.yaml +29 -4
  43. ultralytics/cfg/datasets/lvis.yaml +9 -9
  44. ultralytics/cfg/datasets/medical-pills.yaml +4 -5
  45. ultralytics/cfg/datasets/open-images-v7.yaml +7 -10
  46. ultralytics/cfg/datasets/package-seg.yaml +5 -5
  47. ultralytics/cfg/datasets/signature.yaml +4 -4
  48. ultralytics/cfg/datasets/tiger-pose.yaml +20 -4
  49. ultralytics/cfg/datasets/xView.yaml +5 -5
  50. ultralytics/cfg/default.yaml +96 -93
  51. ultralytics/cfg/trackers/botsort.yaml +16 -17
  52. ultralytics/cfg/trackers/bytetrack.yaml +9 -11
  53. ultralytics/data/__init__.py +4 -4
  54. ultralytics/data/annotator.py +12 -12
  55. ultralytics/data/augment.py +531 -564
  56. ultralytics/data/base.py +76 -81
  57. ultralytics/data/build.py +206 -42
  58. ultralytics/data/converter.py +179 -78
  59. ultralytics/data/dataset.py +121 -121
  60. ultralytics/data/loaders.py +114 -91
  61. ultralytics/data/split.py +28 -15
  62. ultralytics/data/split_dota.py +67 -48
  63. ultralytics/data/utils.py +110 -89
  64. ultralytics/engine/exporter.py +422 -460
  65. ultralytics/engine/model.py +224 -252
  66. ultralytics/engine/predictor.py +94 -89
  67. ultralytics/engine/results.py +345 -595
  68. ultralytics/engine/trainer.py +231 -134
  69. ultralytics/engine/tuner.py +279 -73
  70. ultralytics/engine/validator.py +53 -46
  71. ultralytics/hub/__init__.py +26 -28
  72. ultralytics/hub/auth.py +30 -16
  73. ultralytics/hub/google/__init__.py +34 -36
  74. ultralytics/hub/session.py +53 -77
  75. ultralytics/hub/utils.py +23 -109
  76. ultralytics/models/__init__.py +1 -1
  77. ultralytics/models/fastsam/__init__.py +1 -1
  78. ultralytics/models/fastsam/model.py +36 -18
  79. ultralytics/models/fastsam/predict.py +33 -44
  80. ultralytics/models/fastsam/utils.py +4 -5
  81. ultralytics/models/fastsam/val.py +12 -14
  82. ultralytics/models/nas/__init__.py +1 -1
  83. ultralytics/models/nas/model.py +16 -20
  84. ultralytics/models/nas/predict.py +12 -14
  85. ultralytics/models/nas/val.py +4 -5
  86. ultralytics/models/rtdetr/__init__.py +1 -1
  87. ultralytics/models/rtdetr/model.py +9 -9
  88. ultralytics/models/rtdetr/predict.py +22 -17
  89. ultralytics/models/rtdetr/train.py +20 -16
  90. ultralytics/models/rtdetr/val.py +79 -59
  91. ultralytics/models/sam/__init__.py +8 -2
  92. ultralytics/models/sam/amg.py +53 -38
  93. ultralytics/models/sam/build.py +29 -31
  94. ultralytics/models/sam/model.py +33 -38
  95. ultralytics/models/sam/modules/blocks.py +159 -182
  96. ultralytics/models/sam/modules/decoders.py +38 -47
  97. ultralytics/models/sam/modules/encoders.py +114 -133
  98. ultralytics/models/sam/modules/memory_attention.py +38 -31
  99. ultralytics/models/sam/modules/sam.py +114 -93
  100. ultralytics/models/sam/modules/tiny_encoder.py +268 -291
  101. ultralytics/models/sam/modules/transformer.py +59 -66
  102. ultralytics/models/sam/modules/utils.py +55 -72
  103. ultralytics/models/sam/predict.py +745 -341
  104. ultralytics/models/utils/loss.py +118 -107
  105. ultralytics/models/utils/ops.py +118 -71
  106. ultralytics/models/yolo/__init__.py +1 -1
  107. ultralytics/models/yolo/classify/predict.py +28 -26
  108. ultralytics/models/yolo/classify/train.py +50 -81
  109. ultralytics/models/yolo/classify/val.py +68 -61
  110. ultralytics/models/yolo/detect/predict.py +12 -15
  111. ultralytics/models/yolo/detect/train.py +56 -46
  112. ultralytics/models/yolo/detect/val.py +279 -223
  113. ultralytics/models/yolo/model.py +167 -86
  114. ultralytics/models/yolo/obb/predict.py +7 -11
  115. ultralytics/models/yolo/obb/train.py +23 -25
  116. ultralytics/models/yolo/obb/val.py +107 -99
  117. ultralytics/models/yolo/pose/__init__.py +1 -1
  118. ultralytics/models/yolo/pose/predict.py +12 -14
  119. ultralytics/models/yolo/pose/train.py +31 -69
  120. ultralytics/models/yolo/pose/val.py +119 -254
  121. ultralytics/models/yolo/segment/predict.py +21 -25
  122. ultralytics/models/yolo/segment/train.py +12 -66
  123. ultralytics/models/yolo/segment/val.py +126 -305
  124. ultralytics/models/yolo/world/train.py +53 -45
  125. ultralytics/models/yolo/world/train_world.py +51 -32
  126. ultralytics/models/yolo/yoloe/__init__.py +7 -7
  127. ultralytics/models/yolo/yoloe/predict.py +30 -37
  128. ultralytics/models/yolo/yoloe/train.py +89 -71
  129. ultralytics/models/yolo/yoloe/train_seg.py +15 -17
  130. ultralytics/models/yolo/yoloe/val.py +56 -41
  131. ultralytics/nn/__init__.py +9 -11
  132. ultralytics/nn/autobackend.py +179 -107
  133. ultralytics/nn/modules/__init__.py +67 -67
  134. ultralytics/nn/modules/activation.py +8 -7
  135. ultralytics/nn/modules/block.py +302 -323
  136. ultralytics/nn/modules/conv.py +61 -104
  137. ultralytics/nn/modules/head.py +488 -186
  138. ultralytics/nn/modules/transformer.py +183 -123
  139. ultralytics/nn/modules/utils.py +15 -20
  140. ultralytics/nn/tasks.py +327 -203
  141. ultralytics/nn/text_model.py +81 -65
  142. ultralytics/py.typed +1 -0
  143. ultralytics/solutions/__init__.py +12 -12
  144. ultralytics/solutions/ai_gym.py +19 -27
  145. ultralytics/solutions/analytics.py +36 -26
  146. ultralytics/solutions/config.py +29 -28
  147. ultralytics/solutions/distance_calculation.py +23 -24
  148. ultralytics/solutions/heatmap.py +17 -19
  149. ultralytics/solutions/instance_segmentation.py +21 -19
  150. ultralytics/solutions/object_blurrer.py +16 -17
  151. ultralytics/solutions/object_counter.py +48 -53
  152. ultralytics/solutions/object_cropper.py +22 -16
  153. ultralytics/solutions/parking_management.py +61 -58
  154. ultralytics/solutions/queue_management.py +19 -19
  155. ultralytics/solutions/region_counter.py +63 -50
  156. ultralytics/solutions/security_alarm.py +22 -25
  157. ultralytics/solutions/similarity_search.py +107 -60
  158. ultralytics/solutions/solutions.py +343 -262
  159. ultralytics/solutions/speed_estimation.py +35 -31
  160. ultralytics/solutions/streamlit_inference.py +104 -40
  161. ultralytics/solutions/templates/similarity-search.html +31 -24
  162. ultralytics/solutions/trackzone.py +24 -24
  163. ultralytics/solutions/vision_eye.py +11 -12
  164. ultralytics/trackers/__init__.py +1 -1
  165. ultralytics/trackers/basetrack.py +18 -27
  166. ultralytics/trackers/bot_sort.py +48 -39
  167. ultralytics/trackers/byte_tracker.py +94 -94
  168. ultralytics/trackers/track.py +7 -16
  169. ultralytics/trackers/utils/gmc.py +37 -69
  170. ultralytics/trackers/utils/kalman_filter.py +68 -76
  171. ultralytics/trackers/utils/matching.py +13 -17
  172. ultralytics/utils/__init__.py +251 -275
  173. ultralytics/utils/autobatch.py +19 -7
  174. ultralytics/utils/autodevice.py +68 -38
  175. ultralytics/utils/benchmarks.py +169 -130
  176. ultralytics/utils/callbacks/base.py +12 -13
  177. ultralytics/utils/callbacks/clearml.py +14 -15
  178. ultralytics/utils/callbacks/comet.py +139 -66
  179. ultralytics/utils/callbacks/dvc.py +19 -27
  180. ultralytics/utils/callbacks/hub.py +8 -6
  181. ultralytics/utils/callbacks/mlflow.py +6 -10
  182. ultralytics/utils/callbacks/neptune.py +11 -19
  183. ultralytics/utils/callbacks/platform.py +73 -0
  184. ultralytics/utils/callbacks/raytune.py +3 -4
  185. ultralytics/utils/callbacks/tensorboard.py +9 -12
  186. ultralytics/utils/callbacks/wb.py +33 -30
  187. ultralytics/utils/checks.py +163 -114
  188. ultralytics/utils/cpu.py +89 -0
  189. ultralytics/utils/dist.py +24 -20
  190. ultralytics/utils/downloads.py +176 -146
  191. ultralytics/utils/errors.py +11 -13
  192. ultralytics/utils/events.py +113 -0
  193. ultralytics/utils/export/__init__.py +7 -0
  194. ultralytics/utils/{export.py → export/engine.py} +81 -63
  195. ultralytics/utils/export/imx.py +294 -0
  196. ultralytics/utils/export/tensorflow.py +217 -0
  197. ultralytics/utils/files.py +33 -36
  198. ultralytics/utils/git.py +137 -0
  199. ultralytics/utils/instance.py +105 -120
  200. ultralytics/utils/logger.py +404 -0
  201. ultralytics/utils/loss.py +99 -61
  202. ultralytics/utils/metrics.py +649 -478
  203. ultralytics/utils/nms.py +337 -0
  204. ultralytics/utils/ops.py +263 -451
  205. ultralytics/utils/patches.py +70 -31
  206. ultralytics/utils/plotting.py +253 -223
  207. ultralytics/utils/tal.py +48 -61
  208. ultralytics/utils/torch_utils.py +244 -251
  209. ultralytics/utils/tqdm.py +438 -0
  210. ultralytics/utils/triton.py +22 -23
  211. ultralytics/utils/tuner.py +11 -10
  212. dgenerate_ultralytics_headless-8.3.137.dist-info/RECORD +0 -272
  213. {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/entry_points.txt +0 -0
  214. {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/licenses/LICENSE +0 -0
  215. {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/top_level.txt +0 -0
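The upgrade spans 215 files, including several new modules (for example ultralytics/utils/tqdm.py, ultralytics/utils/nms.py, and the new ultralytics/utils/export/ package) plus a py.typed marker. To confirm which of the two versions is installed before reading the diff, a standard-library query is enough (no assumptions beyond the package name above):

    from importlib.metadata import version

    print(version("dgenerate-ultralytics-headless"))  # e.g. "8.3.137" or "8.3.224"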
ultralytics/nn/modules/transformer.py
@@ -1,6 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 """Transformer modules."""
 
+from __future__ import annotations
+
 import math
 
 import torch
@@ -8,26 +10,30 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import constant_, xavier_uniform_
 
+from ultralytics.utils.torch_utils import TORCH_1_11
+
 from .conv import Conv
 from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
 
 __all__ = (
-    "TransformerEncoderLayer",
-    "TransformerLayer",
-    "TransformerBlock",
-    "MLPBlock",
-    "LayerNorm2d",
     "AIFI",
+    "MLP",
     "DeformableTransformerDecoder",
     "DeformableTransformerDecoderLayer",
+    "LayerNorm2d",
+    "MLPBlock",
     "MSDeformAttn",
-    "MLP",
+    "TransformerBlock",
+    "TransformerEncoderLayer",
+    "TransformerLayer",
 )
 
 
 class TransformerEncoderLayer(nn.Module):
-    """
-    Defines a single layer of the transformer encoder.
+    """A single layer of the transformer encoder.
+
+    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
+    supporting both pre-normalization and post-normalization configurations.
 
     Attributes:
         ma (nn.MultiheadAttention): Multi-head attention module.
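A note on the two imports added in this hunk: `from __future__ import annotations` defers annotation evaluation (PEP 563), which is what lets the file use PEP 604 `torch.Tensor | None` unions in the signatures below while still supporting Python versions older than 3.10, and `TORCH_1_11` guards a `torch.meshgrid` keyword later in the file. A minimal sketch of the first pattern, reusing the `with_pos_embed` signature that appears in this diff:

    from __future__ import annotations  # must precede the other imports

    import torch

    # Without the __future__ import, evaluating `torch.Tensor | None` at
    # definition time raises TypeError on Python < 3.10.
    def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None = None) -> torch.Tensor:
        return tensor if pos is None else tensor + pos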
@@ -42,9 +48,16 @@ class TransformerEncoderLayer(nn.Module):
         normalize_before (bool): Whether to apply normalization before attention and feedforward.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
-        """
-        Initialize the TransformerEncoderLayer with specified parameters.
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0.0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
+        """Initialize the TransformerEncoderLayer with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -76,13 +89,18 @@ class TransformerEncoderLayer(nn.Module):
         self.normalize_before = normalize_before
 
     @staticmethod
-    def with_pos_embed(tensor, pos=None):
+    def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None = None) -> torch.Tensor:
         """Add position embeddings to the tensor if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
-        Perform forward pass with post-normalization.
+    def forward_post(
+        self,
+        src: torch.Tensor,
+        src_mask: torch.Tensor | None = None,
+        src_key_padding_mask: torch.Tensor | None = None,
+        pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform forward pass with post-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -101,9 +119,14 @@ class TransformerEncoderLayer(nn.Module):
         src = src + self.dropout2(src2)
         return self.norm2(src)
 
-    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
-        Perform forward pass with pre-normalization.
+    def forward_pre(
+        self,
+        src: torch.Tensor,
+        src_mask: torch.Tensor | None = None,
+        src_key_padding_mask: torch.Tensor | None = None,
+        pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform forward pass with pre-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -122,9 +145,14 @@ class TransformerEncoderLayer(nn.Module):
         src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
         return src + self.dropout2(src2)
 
-    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
-        Forward propagates the input through the encoder module.
+    def forward(
+        self,
+        src: torch.Tensor,
+        src_mask: torch.Tensor | None = None,
+        src_key_padding_mask: torch.Tensor | None = None,
+        pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Forward propagate the input through the encoder module.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -141,15 +169,22 @@ class TransformerEncoderLayer(nn.Module):
 
 
 class AIFI(TransformerEncoderLayer):
-    """
-    Defines the AIFI transformer layer.
+    """AIFI transformer layer for 2D data with positional embeddings.
 
-    This class extends TransformerEncoderLayer to work with 2D data by adding positional embeddings.
+    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
+    embeddings and handling the spatial dimensions appropriately.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
-        """
-        Initialize the AIFI instance with specified parameters.
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
+        """Initialize the AIFI instance with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -161,9 +196,8 @@ class AIFI(TransformerEncoderLayer):
         """
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
-    def forward(self, x):
-        """
-        Forward pass for the AIFI transformer layer.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass for the AIFI transformer layer.
 
         Args:
             x (torch.Tensor): Input tensor with shape [B, C, H, W].
@@ -178,9 +212,10 @@ class AIFI(TransformerEncoderLayer):
         return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
 
     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
-        """
-        Build 2D sine-cosine position embedding.
+    def build_2d_sincos_position_embedding(
+        w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
+    ) -> torch.Tensor:
+        """Build 2D sine-cosine position embedding.
 
         Args:
             w (int): Width of the feature map.
@@ -194,7 +229,7 @@ class AIFI(TransformerEncoderLayer):
         assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         grid_w = torch.arange(w, dtype=torch.float32)
         grid_h = torch.arange(h, dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") if TORCH_1_11 else torch.meshgrid(grid_w, grid_h)
         pos_dim = embed_dim // 4
         omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
         omega = 1.0 / (temperature**omega)
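The `TORCH_1_11` gate above exists because older `torch.meshgrid` versions do not accept the `indexing` keyword; the legacy call's default behavior already matched "ij", so the two branches are equivalent. A standalone sketch of the same gate (the version parse here is a simplified stand-in for `ultralytics.utils.torch_utils.TORCH_1_11`, an assumption for demonstration):

    import torch

    # Simplified stand-in for the real TORCH_1_11 flag (assumption).
    TORCH_1_11 = tuple(int(v) for v in torch.__version__.split(".")[:2]) >= (1, 11)

    grid_w, grid_h = (
        torch.meshgrid(torch.arange(4.0), torch.arange(3.0), indexing="ij")
        if TORCH_1_11
        else torch.meshgrid(torch.arange(4.0), torch.arange(3.0))  # legacy default behaved like "ij"
    )
    assert grid_w.shape == (4, 3) and grid_h.shape == (4, 3)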
@@ -208,9 +243,8 @@
 class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
-    def __init__(self, c, num_heads):
-        """
-        Initialize a self-attention mechanism using linear transformations and multi-head attention.
+    def __init__(self, c: int, num_heads: int):
+        """Initialize a self-attention mechanism using linear transformations and multi-head attention.
 
         Args:
             c (int): Input and output channel dimension.
@@ -224,9 +258,8 @@ class TransformerLayer(nn.Module):
         self.fc1 = nn.Linear(c, c, bias=False)
         self.fc2 = nn.Linear(c, c, bias=False)
 
-    def forward(self, x):
-        """
-        Apply a transformer block to the input x and return the output.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply a transformer block to the input x and return the output.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -239,8 +272,10 @@ class TransformerLayer(nn.Module):
 
 
 class TransformerBlock(nn.Module):
-    """
-    Vision Transformer https://arxiv.org/abs/2010.11929.
+    """Vision Transformer block based on https://arxiv.org/abs/2010.11929.
+
+    This class implements a complete transformer block with optional convolution layer for channel adjustment, learnable
+    position embedding, and multiple transformer layers.
 
     Attributes:
         conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -249,9 +284,8 @@ class TransformerBlock(nn.Module):
         c2 (int): Output channel dimension.
     """
 
-    def __init__(self, c1, c2, num_heads, num_layers):
-        """
-        Initialize a Transformer module with position embedding and specified number of heads and layers.
+    def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
+        """Initialize a Transformer module with position embedding and specified number of heads and layers.
 
         Args:
             c1 (int): Input channel dimension.
@@ -267,9 +301,8 @@ class TransformerBlock(nn.Module):
         self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
         self.c2 = c2
 
-    def forward(self, x):
-        """
-        Forward propagates the input through the bottleneck module.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward propagate the input through the transformer block.
 
         Args:
             x (torch.Tensor): Input tensor with shape [b, c1, w, h].
@@ -285,11 +318,10 @@
 
 
 class MLPBlock(nn.Module):
-    """Implements a single block of a multi-layer perceptron."""
+    """A single block of a multi-layer perceptron."""
 
-    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
-        """
-        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
+    def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
+        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
 
         Args:
             embedding_dim (int): Input and output dimension.
@@ -302,8 +334,7 @@ class MLPBlock(nn.Module):
         self.act = act()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the MLPBlock.
+        """Forward pass for the MLPBlock.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -315,8 +346,10 @@
 
 
 class MLP(nn.Module):
-    """
-    Implements a simple multi-layer perceptron (also called FFN).
+    """A simple multi-layer perceptron (also called FFN).
+
+    This class implements a configurable MLP with multiple linear layers, activation functions, and optional sigmoid
+    output activation.
 
     Attributes:
         num_layers (int): Number of layers in the MLP.
@@ -325,9 +358,10 @@
         act (nn.Module): Activation function.
     """
 
-    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
-        """
-        Initialize the MLP with specified input, hidden, output dimensions and number of layers.
+    def __init__(
+        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+    ):
+        """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
         Args:
             input_dim (int): Input dimension.
@@ -340,13 +374,12 @@
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
         self.sigmoid = sigmoid
         self.act = act()
 
-    def forward(self, x):
-        """
-        Forward pass for the entire MLP.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass for the entire MLP.
 
         Args:
             x (torch.Tensor): Input tensor.
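The `zip([input_dim, *h], [*h, output_dim])` rewrite in this hunk is behavior-preserving: unpacking replaces list concatenation but yields the same (in_features, out_features) pairs. A worked example of the pairing it produces:

    # Layer-size pairing produced by the zip above, for num_layers=3.
    input_dim, hidden_dim, output_dim, num_layers = 256, 512, 4, 3
    h = [hidden_dim] * (num_layers - 1)                  # [512, 512]
    pairs = list(zip([input_dim, *h], [*h, output_dim]))
    print(pairs)  # [(256, 512), (512, 512), (512, 4)]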
@@ -360,23 +393,23 @@
 
 
 class LayerNorm2d(nn.Module):
-    """
-    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+    """2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
 
-    Original implementations in
-    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
-    and
-    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
+    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension while
+    preserving spatial dimensions.
 
     Attributes:
         weight (nn.Parameter): Learnable scale parameter.
         bias (nn.Parameter): Learnable bias parameter.
         eps (float): Small constant for numerical stability.
+
+    References:
+        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
     """
 
-    def __init__(self, num_channels, eps=1e-6):
-        """
-        Initialize LayerNorm2d with the given parameters.
+    def __init__(self, num_channels: int, eps: float = 1e-6):
+        """Initialize LayerNorm2d with the given parameters.
 
         Args:
             num_channels (int): Number of channels in the input.
@@ -387,9 +420,8 @@ class LayerNorm2d(nn.Module):
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps
 
-    def forward(self, x):
-        """
-        Perform forward pass for 2D layer normalization.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Perform forward pass for 2D layer normalization.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -404,10 +436,10 @@
 
 
 class MSDeformAttn(nn.Module):
-    """
-    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
+    """Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
-    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+    This module implements multiscale deformable attention that can attend to features at multiple scales with learnable
+    sampling locations and attention weights.
 
     Attributes:
         im2col_step (int): Step size for im2col operations.
@@ -419,11 +451,13 @@
         attention_weights (nn.Linear): Linear layer for generating attention weights.
         value_proj (nn.Linear): Linear layer for projecting values.
         output_proj (nn.Linear): Linear layer for projecting output.
+
+    References:
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
     """
 
-    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
-        """
-        Initialize MSDeformAttn with the given parameters.
+    def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
+        """Initialize MSDeformAttn with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -473,23 +507,30 @@
         xavier_uniform_(self.output_proj.weight.data)
         constant_(self.output_proj.bias.data, 0.0)
 
-    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
-        """
-        Perform forward pass for multiscale deformable attention.
-
-        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+    def forward(
+        self,
+        query: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        value: torch.Tensor,
+        value_shapes: list,
+        value_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform forward pass for multiscale deformable attention.
 
         Args:
-            query (torch.Tensor): Tensor with shape [bs, query_length, C].
-            refer_bbox (torch.Tensor): Tensor with shape [bs, query_length, n_levels, 2], range in [0, 1],
-                top-left (0,0), bottom-right (1, 1), including padding area.
-            value (torch.Tensor): Tensor with shape [bs, value_length, C].
+            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
+            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2], range in [0,
+                1], top-left (0,0), bottom-right (1, 1), including padding area.
+            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
             value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
-            value_mask (torch.Tensor, optional): Tensor with shape [bs, value_length], True for non-padding elements,
-                False for padding elements.
+            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
+                elements, False for padding elements.
 
         Returns:
             (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
+
+        References:
+            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
         """
         bs, len_q = query.shape[:2]
         len_v = value.shape[1]
@@ -518,11 +559,10 @@
 
 
 class DeformableTransformerDecoderLayer(nn.Module):
-    """
-    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
+    """Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
 
-    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
-    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
+    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
+    attention, and a feedforward network.
 
     Attributes:
         self_attn (nn.MultiheadAttention): Self-attention module.
@@ -537,11 +577,23 @@
         linear2 (nn.Linear): Second linear layer in the feedforward network.
         dropout4 (nn.Dropout): Dropout after the feedforward network.
         norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
     """
 
-    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
-        """
-        Initialize the DeformableTransformerDecoderLayer with the given parameters.
+    def __init__(
+        self,
+        d_model: int = 256,
+        n_heads: int = 8,
+        d_ffn: int = 1024,
+        dropout: float = 0.0,
+        act: nn.Module = nn.ReLU(),
+        n_levels: int = 4,
+        n_points: int = 4,
+    ):
+        """Initialize the DeformableTransformerDecoderLayer with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -573,13 +625,12 @@
         self.norm3 = nn.LayerNorm(d_model)
 
     @staticmethod
-    def with_pos_embed(tensor, pos):
+    def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None) -> torch.Tensor:
        """Add positional embeddings to the input tensor, if provided."""
        return tensor if pos is None else tensor + pos
 
-    def forward_ffn(self, tgt):
-        """
-        Perform forward pass through the Feed-Forward Network part of the layer.
+    def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
+        """Perform forward pass through the Feed-Forward Network part of the layer.
 
         Args:
             tgt (torch.Tensor): Input tensor.
@@ -591,9 +642,17 @@
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)
 
-    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
-        """
-        Perform the forward pass through the entire decoder layer.
+    def forward(
+        self,
+        embed: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        feats: torch.Tensor,
+        shapes: list,
+        padding_mask: torch.Tensor | None = None,
+        attn_mask: torch.Tensor | None = None,
+        query_pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform the forward pass through the entire decoder layer.
 
         Args:
             embed (torch.Tensor): Input embeddings.
@@ -627,21 +686,23 @@
 
 
 class DeformableTransformerDecoder(nn.Module):
-    """
-    Implementation of Deformable Transformer Decoder based on PaddleDetection.
+    """Deformable Transformer Decoder based on PaddleDetection implementation.
 
-    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction heads
+    for bounding box regression and classification.
 
     Attributes:
         layers (nn.ModuleList): List of decoder layers.
        num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
        eval_idx (int): Index of the layer to use during evaluation.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     """
 
-    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
-        """
-        Initialize the DeformableTransformerDecoder with the given parameters.
+    def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
+        """Initialize the DeformableTransformerDecoder with the given parameters.
 
         Args:
             hidden_dim (int): Hidden dimension.
@@ -657,18 +718,17 @@
 
     def forward(
         self,
-        embed,  # decoder embeddings
-        refer_bbox,  # anchor
-        feats,  # image features
-        shapes,  # feature shapes
-        bbox_head,
-        score_head,
-        pos_mlp,
-        attn_mask=None,
-        padding_mask=None,
+        embed: torch.Tensor,  # decoder embeddings
+        refer_bbox: torch.Tensor,  # anchor
+        feats: torch.Tensor,  # image features
+        shapes: list,  # feature shapes
+        bbox_head: nn.Module,
+        score_head: nn.Module,
+        pos_mlp: nn.Module,
+        attn_mask: torch.Tensor | None = None,
+        padding_mask: torch.Tensor | None = None,
     ):
-        """
-        Perform the forward pass through the entire decoder.
+        """Perform the forward pass through the entire decoder.
 
         Args:
             embed (torch.Tensor): Decoder embeddings.
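The transformer.py changes above are type annotations, docstring reflows, reordered `__all__`, and the meshgrid guard, so runtime behavior should be unchanged. A hedged smoke test of two public modules from the updated `__all__` (shapes follow the docstrings above; this is not an official example from the package):

    import torch
    from ultralytics.nn.modules.transformer import AIFI, MLP

    aifi = AIFI(c1=256, cm=1024, num_heads=8)
    x = torch.rand(1, 256, 20, 20)                        # [B, C, H, W]
    assert aifi(x).shape == (1, 256, 20, 20)              # AIFI preserves the input shape

    mlp = MLP(input_dim=256, hidden_dim=512, output_dim=4, num_layers=3)
    assert mlp(torch.rand(1, 300, 256)).shape == (1, 300, 4)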
ultralytics/nn/modules/utils.py
@@ -9,12 +9,11 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import uniform_
 
-__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
+__all__ = "inverse_sigmoid", "multi_scale_deformable_attn_pytorch"
 
 
 def _get_clones(module, n):
-    """
-    Create a list of cloned modules from the given module.
+    """Create a list of cloned modules from the given module.
 
     Args:
         module (nn.Module): The module to be cloned.
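Only the signature and docstring of `_get_clones` appear in this hunk. In Deformable-DETR-style code the body is conventionally a deep-copy loop wrapped in an nn.ModuleList, so a reasonable sketch (the body is an assumption, not shown in the diff) is:

    import copy

    import torch.nn as nn

    def _get_clones(module: nn.Module, n: int) -> nn.ModuleList:
        # Deep copies so the clones do not share parameters or buffers.
        return nn.ModuleList(copy.deepcopy(module) for _ in range(n))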
@@ -34,12 +33,11 @@
 
 
 def bias_init_with_prob(prior_prob=0.01):
-    """
-    Initialize conv/fc bias value according to a given probability value.
+    """Initialize conv/fc bias value according to a given probability value.
 
-    This function calculates the bias initialization value based on a prior probability using the inverse error function.
-    It's commonly used in object detection models to initialize classification layers with a specific positive prediction
-    probability.
+    This function calculates the bias initialization value based on a prior probability using the inverse error
+    function. It's commonly used in object detection models to initialize classification layers with a specific positive
+    prediction probability.
 
     Args:
         prior_prob (float, optional): Prior probability for bias initialization.
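The hunk shows only the docstring of `bias_init_with_prob`. The conventional form of this initializer in detection codebases is the logit of the prior, so that `sigmoid(bias)` equals `prior_prob` at initialization; treat the body below as an assumption rather than the package's exact code:

    import math

    def bias_init_with_prob(prior_prob: float = 0.01) -> float:
        # Logit of the prior: sigmoid(return value) == prior_prob.
        return float(-math.log((1 - prior_prob) / prior_prob))

    print(round(bias_init_with_prob(0.01), 3))  # -4.595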
@@ -56,11 +54,10 @@
 
 
 def linear_init(module):
-    """
-    Initialize the weights and biases of a linear module.
+    """Initialize the weights and biases of a linear module.
 
-    This function initializes the weights of a linear module using a uniform distribution within bounds calculated
-    from the input dimension. If the module has a bias, it is also initialized.
+    This function initializes the weights of a linear module using a uniform distribution within bounds calculated from
+    the input dimension. If the module has a bias, it is also initialized.
 
     Args:
         module (nn.Module): Linear module to initialize.
@@ -80,8 +77,7 @@
 
 
 def inverse_sigmoid(x, eps=1e-5):
-    """
-    Calculate the inverse sigmoid function for a tensor.
+    """Calculate the inverse sigmoid function for a tensor.
 
     This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
     operations, particularly in attention mechanisms and coordinate transformations.
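`inverse_sigmoid` is the logit function with clamping for numerical stability. The hunk shows the docstring but not the body, so the clamping details below are an assumption consistent with common DETR-style implementations:

    import torch

    def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
        x = x.clamp(min=0, max=1)        # restrict to the valid probability range
        x1 = x.clamp(min=eps)            # floor the numerator away from 0
        x2 = (1 - x).clamp(min=eps)      # floor the denominator away from 0
        return torch.log(x1 / x2)

    p = torch.sigmoid(torch.tensor([-2.0, 0.5, 3.0]))
    print(inverse_sigmoid(p))            # approximately tensor([-2.0000, 0.5000, 3.0000])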
@@ -110,8 +106,7 @@ def multi_scale_deformable_attn_pytorch(
     sampling_locations: torch.Tensor,
     attention_weights: torch.Tensor,
 ) -> torch.Tensor:
-    """
-    Implement multi-scale deformable attention in PyTorch.
+    """Implement multi-scale deformable attention in PyTorch.
 
     This function performs deformable attention across multiple feature map scales, allowing the model to attend to
     different spatial locations with learned offsets.
@@ -119,10 +114,10 @@
     Args:
         value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
         value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
-        sampling_locations (torch.Tensor): The sampling locations with shape
-            (bs, num_queries, num_heads, num_levels, num_points, 2).
-        attention_weights (torch.Tensor): The attention weights with shape
-            (bs, num_queries, num_heads, num_levels, num_points).
+        sampling_locations (torch.Tensor): The sampling locations with shape (bs, num_queries, num_heads, num_levels,
+            num_points, 2).
+        attention_weights (torch.Tensor): The attention weights with shape (bs, num_queries, num_heads, num_levels,
+            num_points).
 
     Returns:
         (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
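A quick shape check against the documented signature (random values). Note that `embed_dims` in the value tensor is the per-head width, so the flattened output channel count is `num_heads * embed_dims`, a detail the return-shape line glosses over:

    import torch
    from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch

    bs, num_heads, head_dims, num_queries, num_points = 2, 8, 32, 10, 4
    shapes = torch.tensor([[8, 8], [4, 4]])                      # (num_levels, 2)
    num_keys = int((shapes[:, 0] * shapes[:, 1]).sum())          # 64 + 16 = 80
    value = torch.rand(bs, num_keys, num_heads, head_dims)
    locs = torch.rand(bs, num_queries, num_heads, len(shapes), num_points, 2)
    weights = torch.rand(bs, num_queries, num_heads, len(shapes), num_points)
    weights = weights / weights.sum(dim=(-2, -1), keepdim=True)  # normalize over levels/points
    out = multi_scale_deformable_attn_pytorch(value, shapes, locs, weights)
    print(out.shape)  # torch.Size([2, 10, 256]) == (bs, num_queries, num_heads * head_dims)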