dgenerate-ultralytics-headless 8.3.143-py3-none-any.whl → 8.3.145-py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (148)
  1. {dgenerate_ultralytics_headless-8.3.143.dist-info → dgenerate_ultralytics_headless-8.3.145.dist-info}/METADATA +2 -2
  2. dgenerate_ultralytics_headless-8.3.145.dist-info/RECORD +272 -0
  3. tests/conftest.py +7 -24
  4. tests/test_cli.py +1 -1
  5. tests/test_cuda.py +7 -2
  6. tests/test_engine.py +7 -8
  7. tests/test_exports.py +16 -16
  8. tests/test_integrations.py +1 -1
  9. tests/test_solutions.py +11 -11
  10. ultralytics/__init__.py +1 -1
  11. ultralytics/cfg/__init__.py +16 -13
  12. ultralytics/data/annotator.py +6 -5
  13. ultralytics/data/augment.py +127 -126
  14. ultralytics/data/base.py +54 -51
  15. ultralytics/data/build.py +47 -23
  16. ultralytics/data/converter.py +47 -43
  17. ultralytics/data/dataset.py +51 -50
  18. ultralytics/data/loaders.py +77 -44
  19. ultralytics/data/split.py +22 -9
  20. ultralytics/data/split_dota.py +63 -39
  21. ultralytics/data/utils.py +59 -39
  22. ultralytics/engine/exporter.py +79 -27
  23. ultralytics/engine/model.py +52 -51
  24. ultralytics/engine/predictor.py +37 -28
  25. ultralytics/engine/results.py +191 -161
  26. ultralytics/engine/trainer.py +36 -19
  27. ultralytics/engine/tuner.py +12 -9
  28. ultralytics/engine/validator.py +7 -9
  29. ultralytics/hub/__init__.py +11 -13
  30. ultralytics/hub/auth.py +22 -2
  31. ultralytics/hub/google/__init__.py +19 -19
  32. ultralytics/hub/session.py +37 -51
  33. ultralytics/hub/utils.py +19 -5
  34. ultralytics/models/fastsam/model.py +30 -12
  35. ultralytics/models/fastsam/predict.py +5 -6
  36. ultralytics/models/fastsam/utils.py +3 -3
  37. ultralytics/models/fastsam/val.py +10 -6
  38. ultralytics/models/nas/model.py +9 -5
  39. ultralytics/models/nas/predict.py +6 -6
  40. ultralytics/models/nas/val.py +3 -3
  41. ultralytics/models/rtdetr/model.py +7 -6
  42. ultralytics/models/rtdetr/predict.py +14 -7
  43. ultralytics/models/rtdetr/train.py +10 -4
  44. ultralytics/models/rtdetr/val.py +36 -9
  45. ultralytics/models/sam/amg.py +30 -12
  46. ultralytics/models/sam/build.py +22 -22
  47. ultralytics/models/sam/model.py +10 -9
  48. ultralytics/models/sam/modules/blocks.py +76 -80
  49. ultralytics/models/sam/modules/decoders.py +6 -8
  50. ultralytics/models/sam/modules/encoders.py +23 -26
  51. ultralytics/models/sam/modules/memory_attention.py +13 -1
  52. ultralytics/models/sam/modules/sam.py +57 -26
  53. ultralytics/models/sam/modules/tiny_encoder.py +232 -237
  54. ultralytics/models/sam/modules/transformer.py +13 -13
  55. ultralytics/models/sam/modules/utils.py +11 -19
  56. ultralytics/models/sam/predict.py +114 -101
  57. ultralytics/models/utils/loss.py +98 -77
  58. ultralytics/models/utils/ops.py +116 -67
  59. ultralytics/models/yolo/classify/predict.py +5 -5
  60. ultralytics/models/yolo/classify/train.py +32 -28
  61. ultralytics/models/yolo/classify/val.py +7 -8
  62. ultralytics/models/yolo/detect/predict.py +1 -0
  63. ultralytics/models/yolo/detect/train.py +15 -14
  64. ultralytics/models/yolo/detect/val.py +37 -36
  65. ultralytics/models/yolo/model.py +106 -23
  66. ultralytics/models/yolo/obb/predict.py +3 -4
  67. ultralytics/models/yolo/obb/train.py +14 -6
  68. ultralytics/models/yolo/obb/val.py +29 -23
  69. ultralytics/models/yolo/pose/predict.py +9 -8
  70. ultralytics/models/yolo/pose/train.py +24 -16
  71. ultralytics/models/yolo/pose/val.py +44 -26
  72. ultralytics/models/yolo/segment/predict.py +5 -5
  73. ultralytics/models/yolo/segment/train.py +11 -7
  74. ultralytics/models/yolo/segment/val.py +2 -2
  75. ultralytics/models/yolo/world/train.py +33 -23
  76. ultralytics/models/yolo/world/train_world.py +11 -3
  77. ultralytics/models/yolo/yoloe/predict.py +11 -11
  78. ultralytics/models/yolo/yoloe/train.py +73 -21
  79. ultralytics/models/yolo/yoloe/train_seg.py +10 -7
  80. ultralytics/models/yolo/yoloe/val.py +42 -18
  81. ultralytics/nn/autobackend.py +59 -15
  82. ultralytics/nn/modules/__init__.py +4 -4
  83. ultralytics/nn/modules/activation.py +4 -1
  84. ultralytics/nn/modules/block.py +178 -111
  85. ultralytics/nn/modules/conv.py +6 -5
  86. ultralytics/nn/modules/head.py +469 -121
  87. ultralytics/nn/modules/transformer.py +147 -58
  88. ultralytics/nn/tasks.py +227 -20
  89. ultralytics/nn/text_model.py +30 -33
  90. ultralytics/solutions/ai_gym.py +4 -6
  91. ultralytics/solutions/analytics.py +7 -4
  92. ultralytics/solutions/config.py +10 -10
  93. ultralytics/solutions/distance_calculation.py +11 -10
  94. ultralytics/solutions/heatmap.py +2 -2
  95. ultralytics/solutions/instance_segmentation.py +7 -4
  96. ultralytics/solutions/object_blurrer.py +3 -3
  97. ultralytics/solutions/object_counter.py +15 -11
  98. ultralytics/solutions/object_cropper.py +3 -2
  99. ultralytics/solutions/parking_management.py +29 -28
  100. ultralytics/solutions/queue_management.py +6 -6
  101. ultralytics/solutions/region_counter.py +10 -3
  102. ultralytics/solutions/security_alarm.py +3 -3
  103. ultralytics/solutions/similarity_search.py +85 -24
  104. ultralytics/solutions/solutions.py +189 -79
  105. ultralytics/solutions/speed_estimation.py +28 -22
  106. ultralytics/solutions/streamlit_inference.py +17 -12
  107. ultralytics/solutions/trackzone.py +4 -4
  108. ultralytics/trackers/basetrack.py +16 -23
  109. ultralytics/trackers/bot_sort.py +30 -20
  110. ultralytics/trackers/byte_tracker.py +70 -64
  111. ultralytics/trackers/track.py +4 -8
  112. ultralytics/trackers/utils/gmc.py +31 -58
  113. ultralytics/trackers/utils/kalman_filter.py +37 -37
  114. ultralytics/trackers/utils/matching.py +1 -1
  115. ultralytics/utils/__init__.py +105 -89
  116. ultralytics/utils/autobatch.py +16 -3
  117. ultralytics/utils/autodevice.py +54 -24
  118. ultralytics/utils/benchmarks.py +45 -29
  119. ultralytics/utils/callbacks/base.py +3 -3
  120. ultralytics/utils/callbacks/clearml.py +9 -9
  121. ultralytics/utils/callbacks/comet.py +67 -25
  122. ultralytics/utils/callbacks/dvc.py +7 -10
  123. ultralytics/utils/callbacks/mlflow.py +2 -5
  124. ultralytics/utils/callbacks/neptune.py +7 -13
  125. ultralytics/utils/callbacks/raytune.py +1 -1
  126. ultralytics/utils/callbacks/tensorboard.py +5 -6
  127. ultralytics/utils/callbacks/wb.py +14 -14
  128. ultralytics/utils/checks.py +14 -13
  129. ultralytics/utils/dist.py +5 -5
  130. ultralytics/utils/downloads.py +94 -67
  131. ultralytics/utils/errors.py +5 -5
  132. ultralytics/utils/export.py +61 -47
  133. ultralytics/utils/files.py +23 -22
  134. ultralytics/utils/instance.py +48 -52
  135. ultralytics/utils/loss.py +78 -40
  136. ultralytics/utils/metrics.py +186 -130
  137. ultralytics/utils/ops.py +186 -190
  138. ultralytics/utils/patches.py +15 -17
  139. ultralytics/utils/plotting.py +71 -27
  140. ultralytics/utils/tal.py +21 -15
  141. ultralytics/utils/torch_utils.py +53 -50
  142. ultralytics/utils/triton.py +5 -4
  143. ultralytics/utils/tuner.py +5 -5
  144. dgenerate_ultralytics_headless-8.3.143.dist-info/RECORD +0 -272
  145. {dgenerate_ultralytics_headless-8.3.143.dist-info → dgenerate_ultralytics_headless-8.3.145.dist-info}/WHEEL +0 -0
  146. {dgenerate_ultralytics_headless-8.3.143.dist-info → dgenerate_ultralytics_headless-8.3.145.dist-info}/entry_points.txt +0 -0
  147. {dgenerate_ultralytics_headless-8.3.143.dist-info → dgenerate_ultralytics_headless-8.3.145.dist-info}/licenses/LICENSE +0 -0
  148. {dgenerate_ultralytics_headless-8.3.143.dist-info → dgenerate_ultralytics_headless-8.3.145.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/transformer.py
@@ -2,6 +2,7 @@
 """Transformer modules."""
 
 import math
+from typing import List, Optional
 
 import torch
 import torch.nn as nn
@@ -27,7 +28,10 @@ __all__ = (
 
 class TransformerEncoderLayer(nn.Module):
     """
-    Defines a single layer of the transformer encoder.
+    A single layer of the transformer encoder.
+
+    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
+    supporting both pre-normalization and post-normalization configurations.
 
     Attributes:
         ma (nn.MultiheadAttention): Multi-head attention module.
@@ -42,7 +46,15 @@ class TransformerEncoderLayer(nn.Module):
         normalize_before (bool): Whether to apply normalization before attention and feedforward.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0.0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
         """
         Initialize the TransformerEncoderLayer with specified parameters.
 
@@ -76,11 +88,17 @@ class TransformerEncoderLayer(nn.Module):
         self.normalize_before = normalize_before
 
     @staticmethod
-    def with_pos_embed(tensor, pos=None):
+    def with_pos_embed(tensor: torch.Tensor, pos: Optional[torch.Tensor] = None) -> torch.Tensor:
         """Add position embeddings to the tensor if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+    def forward_post(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform forward pass with post-normalization.
 
@@ -101,7 +119,13 @@ class TransformerEncoderLayer(nn.Module):
         src = src + self.dropout2(src2)
         return self.norm2(src)
 
-    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+    def forward_pre(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform forward pass with pre-normalization.
 
@@ -122,9 +146,15 @@ class TransformerEncoderLayer(nn.Module):
         src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
         return src + self.dropout2(src2)
 
-    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+    def forward(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
-        Forward propagates the input through the encoder module.
+        Forward propagate the input through the encoder module.
 
         Args:
             src (torch.Tensor): Input tensor.
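
For orientation, a minimal usage sketch of the typed encoder-layer API above. The batch-first [bs, seq_len, c1] layout and the import path are assumptions inferred from the surrounding module, not stated in this hunk:

```python
import torch

from ultralytics.nn.modules.transformer import TransformerEncoderLayer

# Hypothetical shapes: batch of 2, 100 tokens, c1=256 channels (assumed batch-first layout).
layer = TransformerEncoderLayer(c1=256, cm=2048, num_heads=8, normalize_before=True)
src = torch.rand(2, 100, 256)  # input sequence
pos = torch.rand(2, 100, 256)  # optional positional embedding, added to src inside the layer
out = layer(src, pos=pos)      # normalize_before=True routes forward() to forward_pre()
assert out.shape == src.shape
```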
@@ -142,12 +172,21 @@ class TransformerEncoderLayer(nn.Module):
 
 class AIFI(TransformerEncoderLayer):
     """
-    Defines the AIFI transformer layer.
+    AIFI transformer layer for 2D data with positional embeddings.
 
-    This class extends TransformerEncoderLayer to work with 2D data by adding positional embeddings.
+    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
+    embeddings and handling the spatial dimensions appropriately.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
         """
         Initialize the AIFI instance with specified parameters.
 
@@ -161,7 +200,7 @@ class AIFI(TransformerEncoderLayer):
         """
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass for the AIFI transformer layer.
 
@@ -178,7 +217,9 @@ class AIFI(TransformerEncoderLayer):
         return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
 
     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+    def build_2d_sincos_position_embedding(
+        w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
+    ) -> torch.Tensor:
         """
         Build 2D sine-cosine position embedding.
 
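The method typed above builds the standard 2D sine-cosine table: embed_dim is split four ways across sin/cos of the w and h coordinates. A self-contained sketch of that common recipe (an illustration of the construction, not the diffed implementation):

```python
import torch

def sincos_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0) -> torch.Tensor:
    """Standard 2D sine-cosine position embedding with shape [1, w*h, embed_dim]."""
    grid_w, grid_h = torch.meshgrid(
        torch.arange(w, dtype=torch.float32), torch.arange(h, dtype=torch.float32), indexing="ij"
    )
    pos_dim = embed_dim // 4  # embed_dim must be divisible by 4
    omega = 1.0 / (temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim))
    out_w = grid_w.flatten()[..., None] * omega[None]  # [w*h, pos_dim]
    out_h = grid_h.flatten()[..., None] * omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]
```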
@@ -208,7 +249,7 @@ class AIFI(TransformerEncoderLayer):
 
 class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
-    def __init__(self, c, num_heads):
+    def __init__(self, c: int, num_heads: int):
         """
         Initialize a self-attention mechanism using linear transformations and multi-head attention.
 
@@ -224,7 +265,7 @@ class TransformerLayer(nn.Module):
         self.fc1 = nn.Linear(c, c, bias=False)
         self.fc2 = nn.Linear(c, c, bias=False)
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Apply a transformer block to the input x and return the output.
 
@@ -240,7 +281,10 @@ class TransformerLayer(nn.Module):
 
 class TransformerBlock(nn.Module):
     """
-    Vision Transformer https://arxiv.org/abs/2010.11929.
+    Vision Transformer block based on https://arxiv.org/abs/2010.11929.
+
+    This class implements a complete transformer block with optional convolution layer for channel adjustment,
+    learnable position embedding, and multiple transformer layers.
 
     Attributes:
         conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -249,7 +293,7 @@ class TransformerBlock(nn.Module):
         c2 (int): Output channel dimension.
     """
 
-    def __init__(self, c1, c2, num_heads, num_layers):
+    def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
         """
         Initialize a Transformer module with position embedding and specified number of heads and layers.
 
@@ -267,9 +311,9 @@ class TransformerBlock(nn.Module):
         self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
         self.c2 = c2
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Forward propagates the input through the bottleneck module.
+        Forward propagate the input through the transformer block.
 
         Args:
             x (torch.Tensor): Input tensor with shape [b, c1, w, h].
@@ -285,9 +329,9 @@
 
 
 class MLPBlock(nn.Module):
-    """Implements a single block of a multi-layer perceptron."""
+    """A single block of a multi-layer perceptron."""
 
-    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
+    def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
         """
         Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
 
@@ -316,7 +360,10 @@
 
 class MLP(nn.Module):
     """
-    Implements a simple multi-layer perceptron (also called FFN).
+    A simple multi-layer perceptron (also called FFN).
+
+    This class implements a configurable MLP with multiple linear layers, activation functions, and optional
+    sigmoid output activation.
 
     Attributes:
         num_layers (int): Number of layers in the MLP.
@@ -325,7 +372,9 @@
         act (nn.Module): Activation function.
     """
 
-    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
+    def __init__(
+        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+    ):
         """
         Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
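MLP doubles as a DETR-style prediction head elsewhere in this diff (see the bbox_head argument of DeformableTransformerDecoder.forward below), so a hedged usage sketch of the typed constructor; the 4-channel box output and query count are illustrative assumptions:

```python
import torch

from ultralytics.nn.modules.transformer import MLP

# Hypothetical box-regression head: 256 -> 256 -> 4 over 3 layers; sigmoid keeps outputs in [0, 1].
bbox_head = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3, sigmoid=True)
queries = torch.rand(2, 300, 256)  # [bs, num_queries, hidden_dim]
boxes = bbox_head(queries)         # [bs, num_queries, 4]
```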
@@ -344,7 +393,7 @@
         self.sigmoid = sigmoid
         self.act = act()
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass for the entire MLP.
 
@@ -363,18 +412,20 @@ class LayerNorm2d(nn.Module):
     """
     2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
 
-    Original implementations in
-    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
-    and
-    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
+    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
+    while preserving spatial dimensions.
 
     Attributes:
         weight (nn.Parameter): Learnable scale parameter.
         bias (nn.Parameter): Learnable bias parameter.
         eps (float): Small constant for numerical stability.
+
+    References:
+        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
     """
 
-    def __init__(self, num_channels, eps=1e-6):
+    def __init__(self, num_channels: int, eps: float = 1e-6):
         """
         Initialize LayerNorm2d with the given parameters.
 
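The channel-wise normalization described in the new docstring reduces to a few tensor ops. A functional sketch in the spirit of the referenced Detectron2/ConvNeXt implementations (an approximation for illustration, not the diffed code):

```python
import torch

def layernorm_2d(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Normalize an NCHW tensor across its channel dimension, preserving spatial dims."""
    u = x.mean(1, keepdim=True)               # per-location channel mean
    s = (x - u).pow(2).mean(1, keepdim=True)  # per-location channel variance
    x = (x - u) / torch.sqrt(s + eps)
    return weight[:, None, None] * x + bias[:, None, None]
```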
@@ -387,7 +438,7 @@ class LayerNorm2d(nn.Module):
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Perform forward pass for 2D layer normalization.
 
@@ -407,7 +458,8 @@ class MSDeformAttn(nn.Module):
     """
     Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
-    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+    This module implements multiscale deformable attention that can attend to features at multiple scales
+    with learnable sampling locations and attention weights.
 
     Attributes:
         im2col_step (int): Step size for im2col operations.
@@ -419,9 +471,12 @@ class MSDeformAttn(nn.Module):
         attention_weights (nn.Linear): Linear layer for generating attention weights.
         value_proj (nn.Linear): Linear layer for projecting values.
         output_proj (nn.Linear): Linear layer for projecting output.
+
+    References:
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
     """
 
-    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+    def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
         """
         Initialize MSDeformAttn with the given parameters.
 
@@ -473,23 +528,31 @@ class MSDeformAttn(nn.Module):
         xavier_uniform_(self.output_proj.weight.data)
         constant_(self.output_proj.bias.data, 0.0)
 
-    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
+    def forward(
+        self,
+        query: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        value: torch.Tensor,
+        value_shapes: List,
+        value_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform forward pass for multiscale deformable attention.
 
-        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
-
         Args:
-            query (torch.Tensor): Tensor with shape [bs, query_length, C].
-            refer_bbox (torch.Tensor): Tensor with shape [bs, query_length, n_levels, 2], range in [0, 1],
-                top-left (0,0), bottom-right (1, 1), including padding area.
-            value (torch.Tensor): Tensor with shape [bs, value_length, C].
+            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
+            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
+                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
+            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
             value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
-            value_mask (torch.Tensor, optional): Tensor with shape [bs, value_length], True for non-padding elements,
-                False for padding elements.
+            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
+                elements, False for padding elements.
 
         Returns:
             (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
+
+        References:
+            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
         """
         bs, len_q = query.shape[:2]
         len_v = value.shape[1]
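
The revised Args block fixes a complete shape contract that can be exercised directly. A hedged sketch; the three feature levels and 300 queries are illustrative assumptions:

```python
import torch

from ultralytics.nn.modules.transformer import MSDeformAttn

attn = MSDeformAttn(d_model=256, n_levels=3, n_heads=8, n_points=4)
value_shapes = [(80, 80), (40, 40), (20, 20)]       # (H_l, W_l) per level
len_v = sum(h * w for h, w in value_shapes)         # flattened spatial locations across levels
query = torch.rand(2, 300, 256)                     # [bs, query_length, C]
refer_bbox = torch.rand(2, 300, 3, 2)               # normalized (x, y) per level, in [0, 1]
value = torch.rand(2, len_v, 256)                   # [bs, value_length, C]
out = attn(query, refer_bbox, value, value_shapes)  # [2, 300, 256]
```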
@@ -521,8 +584,8 @@ class DeformableTransformerDecoderLayer(nn.Module):
     """
     Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
 
-    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
-    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
+    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
+    attention, and a feedforward network.
 
     Attributes:
         self_attn (nn.MultiheadAttention): Self-attention module.
@@ -537,9 +600,22 @@ class DeformableTransformerDecoderLayer(nn.Module):
         linear2 (nn.Linear): Second linear layer in the feedforward network.
         dropout4 (nn.Dropout): Dropout after the feedforward network.
         norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
     """
 
-    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
+    def __init__(
+        self,
+        d_model: int = 256,
+        n_heads: int = 8,
+        d_ffn: int = 1024,
+        dropout: float = 0.0,
+        act: nn.Module = nn.ReLU(),
+        n_levels: int = 4,
+        n_points: int = 4,
+    ):
         """
         Initialize the DeformableTransformerDecoderLayer with the given parameters.
 
@@ -573,11 +649,11 @@ class DeformableTransformerDecoderLayer(nn.Module):
         self.norm3 = nn.LayerNorm(d_model)
 
     @staticmethod
-    def with_pos_embed(tensor, pos):
+    def with_pos_embed(tensor: torch.Tensor, pos: Optional[torch.Tensor]) -> torch.Tensor:
         """Add positional embeddings to the input tensor, if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_ffn(self, tgt):
+    def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
         """
         Perform forward pass through the Feed-Forward Network part of the layer.
 
@@ -591,7 +667,16 @@ class DeformableTransformerDecoderLayer(nn.Module):
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)
 
-    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
+    def forward(
+        self,
+        embed: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        feats: torch.Tensor,
+        shapes: List,
+        padding_mask: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        query_pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform the forward pass through the entire decoder layer.
 
@@ -628,18 +713,22 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
 class DeformableTransformerDecoder(nn.Module):
     """
-    Implementation of Deformable Transformer Decoder based on PaddleDetection.
+    Deformable Transformer Decoder based on PaddleDetection implementation.
 
-    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
+    heads for bounding box regression and classification.
 
     Attributes:
         layers (nn.ModuleList): List of decoder layers.
        num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
        eval_idx (int): Index of the layer to use during evaluation.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     """
 
-    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+    def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
         """
         Initialize the DeformableTransformerDecoder with the given parameters.
 
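Tying the two classes together, a construction sketch that uses only the typed signatures shown in this diff; the 6-layer, 3-level configuration is an illustrative assumption, not a documented default:

```python
from ultralytics.nn.modules.transformer import (
    DeformableTransformerDecoder,
    DeformableTransformerDecoderLayer,
)

# Hypothetical RT-DETR-style setup: 6 decoder layers over 3 feature levels.
layer = DeformableTransformerDecoderLayer(d_model=256, n_heads=8, d_ffn=1024, n_levels=3)
decoder = DeformableTransformerDecoder(hidden_dim=256, decoder_layer=layer, num_layers=6, eval_idx=-1)
# eval_idx=-1 selects the last layer's predictions during evaluation, per the docstring above.
```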
@@ -657,15 +746,15 @@ class DeformableTransformerDecoder(nn.Module):
 
     def forward(
         self,
-        embed,  # decoder embeddings
-        refer_bbox,  # anchor
-        feats,  # image features
-        shapes,  # feature shapes
-        bbox_head,
-        score_head,
-        pos_mlp,
-        attn_mask=None,
-        padding_mask=None,
+        embed: torch.Tensor,  # decoder embeddings
+        refer_bbox: torch.Tensor,  # anchor
+        feats: torch.Tensor,  # image features
+        shapes: List,  # feature shapes
+        bbox_head: nn.Module,
+        score_head: nn.Module,
+        pos_mlp: nn.Module,
+        attn_mask: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
     ):
         """
         Perform the forward pass through the entire decoder.