ultralytics 8.3.143__py3-none-any.whl → 8.3.144__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/conftest.py +7 -24
- tests/test_cli.py +1 -1
- tests/test_cuda.py +7 -2
- tests/test_engine.py +7 -8
- tests/test_exports.py +16 -16
- tests/test_integrations.py +1 -1
- tests/test_solutions.py +11 -11
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -13
- ultralytics/data/annotator.py +6 -5
- ultralytics/data/augment.py +127 -126
- ultralytics/data/base.py +54 -51
- ultralytics/data/build.py +47 -23
- ultralytics/data/converter.py +47 -43
- ultralytics/data/dataset.py +51 -50
- ultralytics/data/loaders.py +77 -44
- ultralytics/data/split.py +22 -9
- ultralytics/data/split_dota.py +63 -39
- ultralytics/data/utils.py +59 -39
- ultralytics/engine/exporter.py +79 -27
- ultralytics/engine/model.py +39 -39
- ultralytics/engine/predictor.py +37 -28
- ultralytics/engine/results.py +187 -157
- ultralytics/engine/trainer.py +36 -19
- ultralytics/engine/tuner.py +12 -9
- ultralytics/engine/validator.py +7 -9
- ultralytics/hub/__init__.py +11 -13
- ultralytics/hub/auth.py +22 -2
- ultralytics/hub/google/__init__.py +19 -19
- ultralytics/hub/session.py +37 -51
- ultralytics/hub/utils.py +19 -5
- ultralytics/models/fastsam/model.py +30 -12
- ultralytics/models/fastsam/predict.py +5 -6
- ultralytics/models/fastsam/utils.py +3 -3
- ultralytics/models/fastsam/val.py +10 -6
- ultralytics/models/nas/model.py +9 -5
- ultralytics/models/nas/predict.py +6 -6
- ultralytics/models/nas/val.py +3 -3
- ultralytics/models/rtdetr/model.py +7 -6
- ultralytics/models/rtdetr/predict.py +14 -7
- ultralytics/models/rtdetr/train.py +10 -4
- ultralytics/models/rtdetr/val.py +36 -9
- ultralytics/models/sam/amg.py +30 -12
- ultralytics/models/sam/build.py +22 -22
- ultralytics/models/sam/model.py +10 -9
- ultralytics/models/sam/modules/blocks.py +76 -80
- ultralytics/models/sam/modules/decoders.py +6 -8
- ultralytics/models/sam/modules/encoders.py +23 -26
- ultralytics/models/sam/modules/memory_attention.py +13 -1
- ultralytics/models/sam/modules/sam.py +57 -26
- ultralytics/models/sam/modules/tiny_encoder.py +232 -237
- ultralytics/models/sam/modules/transformer.py +13 -13
- ultralytics/models/sam/modules/utils.py +11 -19
- ultralytics/models/sam/predict.py +114 -101
- ultralytics/models/utils/loss.py +98 -77
- ultralytics/models/utils/ops.py +116 -67
- ultralytics/models/yolo/classify/predict.py +5 -5
- ultralytics/models/yolo/classify/train.py +32 -28
- ultralytics/models/yolo/classify/val.py +7 -8
- ultralytics/models/yolo/detect/predict.py +1 -0
- ultralytics/models/yolo/detect/train.py +15 -14
- ultralytics/models/yolo/detect/val.py +37 -36
- ultralytics/models/yolo/model.py +106 -23
- ultralytics/models/yolo/obb/predict.py +3 -4
- ultralytics/models/yolo/obb/train.py +14 -6
- ultralytics/models/yolo/obb/val.py +29 -23
- ultralytics/models/yolo/pose/predict.py +9 -8
- ultralytics/models/yolo/pose/train.py +24 -16
- ultralytics/models/yolo/pose/val.py +44 -26
- ultralytics/models/yolo/segment/predict.py +5 -5
- ultralytics/models/yolo/segment/train.py +11 -7
- ultralytics/models/yolo/segment/val.py +2 -2
- ultralytics/models/yolo/world/train.py +33 -23
- ultralytics/models/yolo/world/train_world.py +11 -3
- ultralytics/models/yolo/yoloe/predict.py +11 -11
- ultralytics/models/yolo/yoloe/train.py +73 -21
- ultralytics/models/yolo/yoloe/train_seg.py +10 -7
- ultralytics/models/yolo/yoloe/val.py +42 -18
- ultralytics/nn/autobackend.py +59 -15
- ultralytics/nn/modules/__init__.py +4 -4
- ultralytics/nn/modules/activation.py +4 -1
- ultralytics/nn/modules/block.py +178 -111
- ultralytics/nn/modules/conv.py +6 -5
- ultralytics/nn/modules/head.py +469 -121
- ultralytics/nn/modules/transformer.py +147 -58
- ultralytics/nn/tasks.py +227 -20
- ultralytics/nn/text_model.py +30 -33
- ultralytics/solutions/ai_gym.py +1 -1
- ultralytics/solutions/analytics.py +7 -4
- ultralytics/solutions/config.py +10 -10
- ultralytics/solutions/distance_calculation.py +11 -10
- ultralytics/solutions/heatmap.py +1 -1
- ultralytics/solutions/instance_segmentation.py +6 -3
- ultralytics/solutions/object_blurrer.py +3 -3
- ultralytics/solutions/object_counter.py +15 -7
- ultralytics/solutions/object_cropper.py +3 -2
- ultralytics/solutions/parking_management.py +29 -28
- ultralytics/solutions/queue_management.py +6 -6
- ultralytics/solutions/region_counter.py +10 -3
- ultralytics/solutions/security_alarm.py +3 -3
- ultralytics/solutions/similarity_search.py +85 -24
- ultralytics/solutions/solutions.py +184 -75
- ultralytics/solutions/speed_estimation.py +28 -22
- ultralytics/solutions/streamlit_inference.py +17 -12
- ultralytics/solutions/trackzone.py +4 -4
- ultralytics/trackers/basetrack.py +16 -23
- ultralytics/trackers/bot_sort.py +30 -20
- ultralytics/trackers/byte_tracker.py +70 -64
- ultralytics/trackers/track.py +4 -8
- ultralytics/trackers/utils/gmc.py +31 -58
- ultralytics/trackers/utils/kalman_filter.py +37 -37
- ultralytics/trackers/utils/matching.py +1 -1
- ultralytics/utils/__init__.py +105 -89
- ultralytics/utils/autobatch.py +16 -3
- ultralytics/utils/autodevice.py +54 -24
- ultralytics/utils/benchmarks.py +42 -28
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +9 -9
- ultralytics/utils/callbacks/comet.py +67 -25
- ultralytics/utils/callbacks/dvc.py +7 -10
- ultralytics/utils/callbacks/mlflow.py +2 -5
- ultralytics/utils/callbacks/neptune.py +7 -13
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +5 -6
- ultralytics/utils/callbacks/wb.py +14 -14
- ultralytics/utils/checks.py +14 -13
- ultralytics/utils/dist.py +5 -5
- ultralytics/utils/downloads.py +94 -67
- ultralytics/utils/errors.py +5 -5
- ultralytics/utils/export.py +61 -47
- ultralytics/utils/files.py +23 -22
- ultralytics/utils/instance.py +48 -52
- ultralytics/utils/loss.py +78 -40
- ultralytics/utils/metrics.py +186 -130
- ultralytics/utils/ops.py +186 -190
- ultralytics/utils/patches.py +15 -17
- ultralytics/utils/plotting.py +71 -27
- ultralytics/utils/tal.py +21 -15
- ultralytics/utils/torch_utils.py +53 -50
- ultralytics/utils/triton.py +5 -4
- ultralytics/utils/tuner.py +5 -5
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.144.dist-info}/METADATA +1 -1
- ultralytics-8.3.144.dist-info/RECORD +272 -0
- ultralytics-8.3.143.dist-info/RECORD +0 -272
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.144.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.144.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.144.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.144.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/transformer.py (+147 -58):

```diff
@@ -2,6 +2,7 @@
 """Transformer modules."""
 
 import math
+from typing import List, Optional
 
 import torch
 import torch.nn as nn
@@ -27,7 +28,10 @@ __all__ = (
 
 class TransformerEncoderLayer(nn.Module):
     """
-
+    A single layer of the transformer encoder.
+
+    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
+    supporting both pre-normalization and post-normalization configurations.
 
     Attributes:
         ma (nn.MultiheadAttention): Multi-head attention module.
@@ -42,7 +46,15 @@ class TransformerEncoderLayer(nn.Module):
         normalize_before (bool): Whether to apply normalization before attention and feedforward.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0.0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
         """
         Initialize the TransformerEncoderLayer with specified parameters.
 
@@ -76,11 +88,17 @@ class TransformerEncoderLayer(nn.Module):
         self.normalize_before = normalize_before
 
     @staticmethod
-    def with_pos_embed(tensor, pos=None):
+    def with_pos_embed(tensor: torch.Tensor, pos: Optional[torch.Tensor] = None) -> torch.Tensor:
         """Add position embeddings to the tensor if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+    def forward_post(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform forward pass with post-normalization.
 
@@ -101,7 +119,13 @@ class TransformerEncoderLayer(nn.Module):
         src = src + self.dropout2(src2)
         return self.norm2(src)
 
-    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+    def forward_pre(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform forward pass with pre-normalization.
 
@@ -122,9 +146,15 @@ class TransformerEncoderLayer(nn.Module):
         src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
         return src + self.dropout2(src2)
 
-    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
+    def forward(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
-        Forward
+        Forward propagate the input through the encoder module.
 
         Args:
             src (torch.Tensor): Input tensor.
```
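Most of the churn in these hunks is annotation-only, but the pre/post-normalization split that the new docstring calls out is the one behavioral fork in the layer. A minimal standalone sketch of the two orderings, with tensor names and sizes that are illustrative rather than taken from the diff:

```python
import torch
import torch.nn as nn

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
norm = nn.LayerNorm(256)
src = torch.randn(100, 2, 256)  # (seq_len, batch, channels), sizes are arbitrary

# Post-norm (normalize_before=False): attention, residual add, then LayerNorm.
post = norm(src + attn(src, src, src)[0])

# Pre-norm (normalize_before=True): LayerNorm first, then attention plus residual.
s = norm(src)
pre = src + attn(s, s, s)[0]
```

The same ordering choice repeats for the feedforward half of the layer, which is why the class keeps separate forward_post and forward_pre paths.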
```diff
@@ -142,12 +172,21 @@ class TransformerEncoderLayer(nn.Module):
 
 class AIFI(TransformerEncoderLayer):
     """
-
+    AIFI transformer layer for 2D data with positional embeddings.
 
-    This class extends TransformerEncoderLayer to work with 2D
+    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
+    embeddings and handling the spatial dimensions appropriately.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
         """
         Initialize the AIFI instance with specified parameters.
 
@@ -161,7 +200,7 @@ class AIFI(TransformerEncoderLayer):
         """
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass for the AIFI transformer layer.
 
@@ -178,7 +217,9 @@ class AIFI(TransformerEncoderLayer):
         return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
 
     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+    def build_2d_sincos_position_embedding(
+        w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
+    ) -> torch.Tensor:
         """
         Build 2D sine-cosine position embedding.
 
```
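The hunk above only retypes build_2d_sincos_position_embedding, but AIFI's new summary leans on it, so a rough sketch of the standard 2D sine-cosine construction is useful context. This is a paraphrase of the common recipe, not the Ultralytics body, and it assumes embed_dim is divisible by 4:

```python
import torch


def sincos_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0) -> torch.Tensor:
    """Return [1, w*h, embed_dim] sine-cosine position embeddings for a w x h grid."""
    grid_w, grid_h = torch.meshgrid(
        torch.arange(w, dtype=torch.float32), torch.arange(h, dtype=torch.float32), indexing="ij"
    )
    pos_dim = embed_dim // 4  # a quarter of the channels for each of sin/cos x w/h
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[:, None] * omega[None]  # [w*h, pos_dim]
    out_h = grid_h.flatten()[:, None] * omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]
```

AIFI flattens the [B, C, H, W] feature map to a sequence, adds such an embedding, and restores the spatial layout, which is what the forward hunk's closing permute(0, 2, 1).view([-1, c, h, w]) line does.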
```diff
@@ -208,7 +249,7 @@ class AIFI(TransformerEncoderLayer):
 class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
-    def __init__(self, c, num_heads):
+    def __init__(self, c: int, num_heads: int):
         """
         Initialize a self-attention mechanism using linear transformations and multi-head attention.
 
@@ -224,7 +265,7 @@ class TransformerLayer(nn.Module):
         self.fc1 = nn.Linear(c, c, bias=False)
         self.fc2 = nn.Linear(c, c, bias=False)
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Apply a transformer block to the input x and return the output.
 
@@ -240,7 +281,10 @@ class TransformerLayer(nn.Module):
 
 class TransformerBlock(nn.Module):
     """
-    Vision Transformer https://arxiv.org/abs/2010.11929.
+    Vision Transformer block based on https://arxiv.org/abs/2010.11929.
+
+    This class implements a complete transformer block with optional convolution layer for channel adjustment,
+    learnable position embedding, and multiple transformer layers.
 
     Attributes:
         conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -249,7 +293,7 @@ class TransformerBlock(nn.Module):
         c2 (int): Output channel dimension.
     """
 
-    def __init__(self, c1, c2, num_heads, num_layers):
+    def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
         """
         Initialize a Transformer module with position embedding and specified number of heads and layers.
 
@@ -267,9 +311,9 @@ class TransformerBlock(nn.Module):
         self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
         self.c2 = c2
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Forward
+        Forward propagate the input through the transformer block.
 
         Args:
             x (torch.Tensor): Input tensor with shape [b, c1, w, h].
```
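TransformerBlock's forward consumes [b, c1, w, h]; the standard pattern that shape implies is flattening the spatial grid into a token sequence for the stacked TransformerLayer modules and folding it back afterwards. A shape-only sketch under that assumption:

```python
import torch

b, c, w, h = 2, 256, 20, 20  # illustrative sizes
x = torch.randn(b, c, w, h)

# Flatten the spatial grid into a sequence: [b, c, w, h] -> [w*h, b, c]
seq = x.flatten(2).permute(2, 0, 1)

# ... the num_layers TransformerLayer modules would operate on `seq` here ...

# Fold the sequence back into a feature map: [w*h, b, c] -> [b, c, w, h]
y = seq.permute(1, 2, 0).reshape(b, c, w, h)
assert y.shape == x.shape
```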
```diff
@@ -285,9 +329,9 @@ class TransformerBlock(nn.Module):
 
 
 class MLPBlock(nn.Module):
-    """
+    """A single block of a multi-layer perceptron."""
 
-    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
+    def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
         """
         Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
 
@@ -316,7 +360,10 @@ class MLPBlock(nn.Module):
 
 class MLP(nn.Module):
     """
-
+    A simple multi-layer perceptron (also called FFN).
+
+    This class implements a configurable MLP with multiple linear layers, activation functions, and optional
+    sigmoid output activation.
 
     Attributes:
         num_layers (int): Number of layers in the MLP.
@@ -325,7 +372,9 @@ class MLP(nn.Module):
         act (nn.Module): Activation function.
     """
 
-    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
+    def __init__(
+        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+    ):
         """
         Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
@@ -344,7 +393,7 @@ class MLP(nn.Module):
         self.sigmoid = sigmoid
         self.act = act()
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass for the entire MLP.
 
```
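The MLP docstring describes num_layers linear layers chained input -> hidden -> output. A compact sketch of that wiring in the DETR FFN style; an illustration, not the verbatim Ultralytics initializer:

```python
import torch.nn as nn


def build_mlp_layers(input_dim: int, hidden_dim: int, output_dim: int, num_layers: int) -> nn.ModuleList:
    """Chain Linear layers: input_dim -> hidden_dim x (num_layers - 1) -> output_dim."""
    h = [hidden_dim] * (num_layers - 1)
    return nn.ModuleList(nn.Linear(i, o) for i, o in zip([input_dim] + h, h + [output_dim]))


layers = build_mlp_layers(256, 256, 4, num_layers=3)  # e.g. a 3-layer box-regression head
```

The forward pass then applies the activation between every pair of layers and, per the new sigmoid flag, optionally squashes the final output.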
```diff
@@ -363,18 +412,20 @@ class LayerNorm2d(nn.Module):
     """
     2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
 
-
-    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
-    and
-    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
+    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
+    while preserving spatial dimensions.
 
     Attributes:
         weight (nn.Parameter): Learnable scale parameter.
         bias (nn.Parameter): Learnable bias parameter.
         eps (float): Small constant for numerical stability.
+
+    References:
+        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
     """
 
-    def __init__(self, num_channels, eps=1e-6):
+    def __init__(self, num_channels: int, eps: float = 1e-6):
         """
         Initialize LayerNorm2d with the given parameters.
 
@@ -387,7 +438,7 @@ class LayerNorm2d(nn.Module):
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Perform forward pass for 2D layer normalization.
 
```
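The forward body is not shown in these hunks, but channel-wise 2D layer normalization as the docstring describes it (normalize across C at every spatial position, then apply the learnable affine) reduces to a few tensor ops. A functional sketch that should match those semantics:

```python
import torch


def layer_norm_2d(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Normalize a [N, C, H, W] tensor across C, then scale and shift per channel."""
    u = x.mean(1, keepdim=True)  # per-position mean over channels
    s = (x - u).pow(2).mean(1, keepdim=True)  # per-position variance over channels
    x = (x - u) / torch.sqrt(s + eps)
    return weight[:, None, None] * x + bias[:, None, None]
```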
```diff
@@ -407,7 +458,8 @@ class MSDeformAttn(nn.Module):
     """
     Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
-
+    This module implements multiscale deformable attention that can attend to features at multiple scales
+    with learnable sampling locations and attention weights.
 
     Attributes:
         im2col_step (int): Step size for im2col operations.
@@ -419,9 +471,12 @@ class MSDeformAttn(nn.Module):
         attention_weights (nn.Linear): Linear layer for generating attention weights.
         value_proj (nn.Linear): Linear layer for projecting values.
         output_proj (nn.Linear): Linear layer for projecting output.
+
+    References:
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
     """
 
-    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+    def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
         """
         Initialize MSDeformAttn with the given parameters.
 
@@ -473,23 +528,31 @@ class MSDeformAttn(nn.Module):
         xavier_uniform_(self.output_proj.weight.data)
         constant_(self.output_proj.bias.data, 0.0)
 
-    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
+    def forward(
+        self,
+        query: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        value: torch.Tensor,
+        value_shapes: List,
+        value_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform forward pass for multiscale deformable attention.
 
-        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
-
         Args:
-            query (torch.Tensor):
-            refer_bbox (torch.Tensor):
-                top-left (0,0), bottom-right (1, 1), including padding area.
-            value (torch.Tensor):
+            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
+            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
+                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
+            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
             value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
-            value_mask (torch.Tensor, optional):
-                False for padding elements.
+            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
+                elements, False for padding elements.
 
         Returns:
             (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
+
+        References:
+            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
         """
         bs, len_q = query.shape[:2]
         len_v = value.shape[1]
```
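With the shapes now spelled out in the docstring, the call pattern is easy to exercise. A hypothetical smoke test under those shapes; the import path follows the file being diffed, and all sizes are made up:

```python
import torch

from ultralytics.nn.modules.transformer import MSDeformAttn

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
shapes = [(32, 32), (16, 16)]  # (H_i, W_i) for each feature level
value = torch.randn(2, sum(hh * ww for hh, ww in shapes), 256)  # [bs, value_length, C]
query = torch.randn(2, 300, 256)  # [bs, query_length, C]
refer_bbox = torch.rand(2, 300, len(shapes), 2)  # normalized reference points in [0, 1]
out = attn(query, refer_bbox, value, shapes)  # expected: [2, 300, 256]
```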
```diff
@@ -521,8 +584,8 @@ class DeformableTransformerDecoderLayer(nn.Module):
     """
     Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
 
-
-
+    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
+    attention, and a feedforward network.
 
     Attributes:
         self_attn (nn.MultiheadAttention): Self-attention module.
@@ -537,9 +600,22 @@ class DeformableTransformerDecoderLayer(nn.Module):
         linear2 (nn.Linear): Second linear layer in the feedforward network.
         dropout4 (nn.Dropout): Dropout after the feedforward network.
         norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
     """
 
-    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
+    def __init__(
+        self,
+        d_model: int = 256,
+        n_heads: int = 8,
+        d_ffn: int = 1024,
+        dropout: float = 0.0,
+        act: nn.Module = nn.ReLU(),
+        n_levels: int = 4,
+        n_points: int = 4,
+    ):
         """
         Initialize the DeformableTransformerDecoderLayer with the given parameters.
 
@@ -573,11 +649,11 @@ class DeformableTransformerDecoderLayer(nn.Module):
         self.norm3 = nn.LayerNorm(d_model)
 
     @staticmethod
-    def with_pos_embed(tensor, pos):
+    def with_pos_embed(tensor: torch.Tensor, pos: Optional[torch.Tensor]) -> torch.Tensor:
         """Add positional embeddings to the input tensor, if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_ffn(self, tgt):
+    def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
         """
         Perform forward pass through the Feed-Forward Network part of the layer.
 
@@ -591,7 +667,16 @@ class DeformableTransformerDecoderLayer(nn.Module):
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)
 
-    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
+    def forward(
+        self,
+        embed: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        feats: torch.Tensor,
+        shapes: List,
+        padding_mask: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        query_pos: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         """
         Perform the forward pass through the entire decoder layer.
 
@@ -628,18 +713,22 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
 class DeformableTransformerDecoder(nn.Module):
     """
-
+    Deformable Transformer Decoder based on PaddleDetection implementation.
 
-
+    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
+    heads for bounding box regression and classification.
 
     Attributes:
         layers (nn.ModuleList): List of decoder layers.
         num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
         eval_idx (int): Index of the layer to use during evaluation.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     """
 
-    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+    def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
         """
         Initialize the DeformableTransformerDecoder with the given parameters.
 
@@ -657,15 +746,15 @@ class DeformableTransformerDecoder(nn.Module):
 
     def forward(
         self,
-        embed,  # decoder embeddings
-        refer_bbox,  # anchor
-        feats,  # image features
-        shapes,  # feature shapes
-        bbox_head,
-        score_head,
-        pos_mlp,
-        attn_mask=None,
-        padding_mask=None,
+        embed: torch.Tensor,  # decoder embeddings
+        refer_bbox: torch.Tensor,  # anchor
+        feats: torch.Tensor,  # image features
+        shapes: List,  # feature shapes
+        bbox_head: nn.Module,
+        score_head: nn.Module,
+        pos_mlp: nn.Module,
+        attn_mask: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
     ):
         """
         Perform the forward pass through the entire decoder.
```
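The decoder forward's bbox_head, score_head, and pos_mlp arguments imply Deformable-DETR-style iterative refinement: each layer predicts a logit-space delta that nudges refer_bbox, and eval_idx picks which layer's predictions are used at inference. A toy sketch of one refinement step, with inverse_sigmoid written inline and a random tensor standing in for a bbox head output:

```python
import torch


def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """Clamped logit, the inverse of sigmoid."""
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))


refer_bbox = torch.rand(2, 300, 4)  # illustrative [bs, queries, box] in [0, 1]
delta = 0.1 * torch.randn(2, 300, 4)  # stand-in for bbox_head[i](embed)
refined = torch.sigmoid(delta + inverse_sigmoid(refer_bbox))  # next layer's reference
```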
|