dgenerate-ultralytics-headless 8.3.137__py3-none-any.whl → 8.3.224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/METADATA +41 -34
- dgenerate_ultralytics_headless-8.3.224.dist-info/RECORD +285 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/WHEEL +1 -1
- tests/__init__.py +7 -6
- tests/conftest.py +15 -39
- tests/test_cli.py +17 -17
- tests/test_cuda.py +17 -8
- tests/test_engine.py +36 -10
- tests/test_exports.py +98 -37
- tests/test_integrations.py +12 -15
- tests/test_python.py +126 -82
- tests/test_solutions.py +319 -135
- ultralytics/__init__.py +27 -9
- ultralytics/cfg/__init__.py +83 -87
- ultralytics/cfg/datasets/Argoverse.yaml +4 -4
- ultralytics/cfg/datasets/DOTAv1.5.yaml +2 -2
- ultralytics/cfg/datasets/DOTAv1.yaml +2 -2
- ultralytics/cfg/datasets/GlobalWheat2020.yaml +2 -2
- ultralytics/cfg/datasets/HomeObjects-3K.yaml +4 -5
- ultralytics/cfg/datasets/ImageNet.yaml +3 -3
- ultralytics/cfg/datasets/Objects365.yaml +24 -20
- ultralytics/cfg/datasets/SKU-110K.yaml +9 -9
- ultralytics/cfg/datasets/VOC.yaml +10 -13
- ultralytics/cfg/datasets/VisDrone.yaml +43 -33
- ultralytics/cfg/datasets/african-wildlife.yaml +5 -5
- ultralytics/cfg/datasets/brain-tumor.yaml +4 -5
- ultralytics/cfg/datasets/carparts-seg.yaml +5 -5
- ultralytics/cfg/datasets/coco-pose.yaml +26 -4
- ultralytics/cfg/datasets/coco.yaml +4 -4
- ultralytics/cfg/datasets/coco128-seg.yaml +2 -2
- ultralytics/cfg/datasets/coco128.yaml +2 -2
- ultralytics/cfg/datasets/coco8-grayscale.yaml +103 -0
- ultralytics/cfg/datasets/coco8-multispectral.yaml +2 -2
- ultralytics/cfg/datasets/coco8-pose.yaml +23 -2
- ultralytics/cfg/datasets/coco8-seg.yaml +2 -2
- ultralytics/cfg/datasets/coco8.yaml +2 -2
- ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
- ultralytics/cfg/datasets/crack-seg.yaml +5 -5
- ultralytics/cfg/datasets/dog-pose.yaml +32 -4
- ultralytics/cfg/datasets/dota8-multispectral.yaml +2 -2
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +29 -4
- ultralytics/cfg/datasets/lvis.yaml +9 -9
- ultralytics/cfg/datasets/medical-pills.yaml +4 -5
- ultralytics/cfg/datasets/open-images-v7.yaml +7 -10
- ultralytics/cfg/datasets/package-seg.yaml +5 -5
- ultralytics/cfg/datasets/signature.yaml +4 -4
- ultralytics/cfg/datasets/tiger-pose.yaml +20 -4
- ultralytics/cfg/datasets/xView.yaml +5 -5
- ultralytics/cfg/default.yaml +96 -93
- ultralytics/cfg/trackers/botsort.yaml +16 -17
- ultralytics/cfg/trackers/bytetrack.yaml +9 -11
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +12 -12
- ultralytics/data/augment.py +531 -564
- ultralytics/data/base.py +76 -81
- ultralytics/data/build.py +206 -42
- ultralytics/data/converter.py +179 -78
- ultralytics/data/dataset.py +121 -121
- ultralytics/data/loaders.py +114 -91
- ultralytics/data/split.py +28 -15
- ultralytics/data/split_dota.py +67 -48
- ultralytics/data/utils.py +110 -89
- ultralytics/engine/exporter.py +422 -460
- ultralytics/engine/model.py +224 -252
- ultralytics/engine/predictor.py +94 -89
- ultralytics/engine/results.py +345 -595
- ultralytics/engine/trainer.py +231 -134
- ultralytics/engine/tuner.py +279 -73
- ultralytics/engine/validator.py +53 -46
- ultralytics/hub/__init__.py +26 -28
- ultralytics/hub/auth.py +30 -16
- ultralytics/hub/google/__init__.py +34 -36
- ultralytics/hub/session.py +53 -77
- ultralytics/hub/utils.py +23 -109
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +36 -18
- ultralytics/models/fastsam/predict.py +33 -44
- ultralytics/models/fastsam/utils.py +4 -5
- ultralytics/models/fastsam/val.py +12 -14
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +16 -20
- ultralytics/models/nas/predict.py +12 -14
- ultralytics/models/nas/val.py +4 -5
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +9 -9
- ultralytics/models/rtdetr/predict.py +22 -17
- ultralytics/models/rtdetr/train.py +20 -16
- ultralytics/models/rtdetr/val.py +79 -59
- ultralytics/models/sam/__init__.py +8 -2
- ultralytics/models/sam/amg.py +53 -38
- ultralytics/models/sam/build.py +29 -31
- ultralytics/models/sam/model.py +33 -38
- ultralytics/models/sam/modules/blocks.py +159 -182
- ultralytics/models/sam/modules/decoders.py +38 -47
- ultralytics/models/sam/modules/encoders.py +114 -133
- ultralytics/models/sam/modules/memory_attention.py +38 -31
- ultralytics/models/sam/modules/sam.py +114 -93
- ultralytics/models/sam/modules/tiny_encoder.py +268 -291
- ultralytics/models/sam/modules/transformer.py +59 -66
- ultralytics/models/sam/modules/utils.py +55 -72
- ultralytics/models/sam/predict.py +745 -341
- ultralytics/models/utils/loss.py +118 -107
- ultralytics/models/utils/ops.py +118 -71
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +28 -26
- ultralytics/models/yolo/classify/train.py +50 -81
- ultralytics/models/yolo/classify/val.py +68 -61
- ultralytics/models/yolo/detect/predict.py +12 -15
- ultralytics/models/yolo/detect/train.py +56 -46
- ultralytics/models/yolo/detect/val.py +279 -223
- ultralytics/models/yolo/model.py +167 -86
- ultralytics/models/yolo/obb/predict.py +7 -11
- ultralytics/models/yolo/obb/train.py +23 -25
- ultralytics/models/yolo/obb/val.py +107 -99
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +12 -14
- ultralytics/models/yolo/pose/train.py +31 -69
- ultralytics/models/yolo/pose/val.py +119 -254
- ultralytics/models/yolo/segment/predict.py +21 -25
- ultralytics/models/yolo/segment/train.py +12 -66
- ultralytics/models/yolo/segment/val.py +126 -305
- ultralytics/models/yolo/world/train.py +53 -45
- ultralytics/models/yolo/world/train_world.py +51 -32
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +30 -37
- ultralytics/models/yolo/yoloe/train.py +89 -71
- ultralytics/models/yolo/yoloe/train_seg.py +15 -17
- ultralytics/models/yolo/yoloe/val.py +56 -41
- ultralytics/nn/__init__.py +9 -11
- ultralytics/nn/autobackend.py +179 -107
- ultralytics/nn/modules/__init__.py +67 -67
- ultralytics/nn/modules/activation.py +8 -7
- ultralytics/nn/modules/block.py +302 -323
- ultralytics/nn/modules/conv.py +61 -104
- ultralytics/nn/modules/head.py +488 -186
- ultralytics/nn/modules/transformer.py +183 -123
- ultralytics/nn/modules/utils.py +15 -20
- ultralytics/nn/tasks.py +327 -203
- ultralytics/nn/text_model.py +81 -65
- ultralytics/py.typed +1 -0
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +19 -27
- ultralytics/solutions/analytics.py +36 -26
- ultralytics/solutions/config.py +29 -28
- ultralytics/solutions/distance_calculation.py +23 -24
- ultralytics/solutions/heatmap.py +17 -19
- ultralytics/solutions/instance_segmentation.py +21 -19
- ultralytics/solutions/object_blurrer.py +16 -17
- ultralytics/solutions/object_counter.py +48 -53
- ultralytics/solutions/object_cropper.py +22 -16
- ultralytics/solutions/parking_management.py +61 -58
- ultralytics/solutions/queue_management.py +19 -19
- ultralytics/solutions/region_counter.py +63 -50
- ultralytics/solutions/security_alarm.py +22 -25
- ultralytics/solutions/similarity_search.py +107 -60
- ultralytics/solutions/solutions.py +343 -262
- ultralytics/solutions/speed_estimation.py +35 -31
- ultralytics/solutions/streamlit_inference.py +104 -40
- ultralytics/solutions/templates/similarity-search.html +31 -24
- ultralytics/solutions/trackzone.py +24 -24
- ultralytics/solutions/vision_eye.py +11 -12
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +18 -27
- ultralytics/trackers/bot_sort.py +48 -39
- ultralytics/trackers/byte_tracker.py +94 -94
- ultralytics/trackers/track.py +7 -16
- ultralytics/trackers/utils/gmc.py +37 -69
- ultralytics/trackers/utils/kalman_filter.py +68 -76
- ultralytics/trackers/utils/matching.py +13 -17
- ultralytics/utils/__init__.py +251 -275
- ultralytics/utils/autobatch.py +19 -7
- ultralytics/utils/autodevice.py +68 -38
- ultralytics/utils/benchmarks.py +169 -130
- ultralytics/utils/callbacks/base.py +12 -13
- ultralytics/utils/callbacks/clearml.py +14 -15
- ultralytics/utils/callbacks/comet.py +139 -66
- ultralytics/utils/callbacks/dvc.py +19 -27
- ultralytics/utils/callbacks/hub.py +8 -6
- ultralytics/utils/callbacks/mlflow.py +6 -10
- ultralytics/utils/callbacks/neptune.py +11 -19
- ultralytics/utils/callbacks/platform.py +73 -0
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +9 -12
- ultralytics/utils/callbacks/wb.py +33 -30
- ultralytics/utils/checks.py +163 -114
- ultralytics/utils/cpu.py +89 -0
- ultralytics/utils/dist.py +24 -20
- ultralytics/utils/downloads.py +176 -146
- ultralytics/utils/errors.py +11 -13
- ultralytics/utils/events.py +113 -0
- ultralytics/utils/export/__init__.py +7 -0
- ultralytics/utils/{export.py → export/engine.py} +81 -63
- ultralytics/utils/export/imx.py +294 -0
- ultralytics/utils/export/tensorflow.py +217 -0
- ultralytics/utils/files.py +33 -36
- ultralytics/utils/git.py +137 -0
- ultralytics/utils/instance.py +105 -120
- ultralytics/utils/logger.py +404 -0
- ultralytics/utils/loss.py +99 -61
- ultralytics/utils/metrics.py +649 -478
- ultralytics/utils/nms.py +337 -0
- ultralytics/utils/ops.py +263 -451
- ultralytics/utils/patches.py +70 -31
- ultralytics/utils/plotting.py +253 -223
- ultralytics/utils/tal.py +48 -61
- ultralytics/utils/torch_utils.py +244 -251
- ultralytics/utils/tqdm.py +438 -0
- ultralytics/utils/triton.py +22 -23
- ultralytics/utils/tuner.py +11 -10
- dgenerate_ultralytics_headless-8.3.137.dist-info/RECORD +0 -272
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/transformer.py
CHANGED

```diff
@@ -1,6 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 """Transformer modules."""
 
+from __future__ import annotations
+
 import math
 
 import torch
```
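The new `from __future__ import annotations` line is what lets the PEP 604 unions (`torch.Tensor | None`) used throughout the annotated signatures below run on Python versions older than 3.10: it keeps every annotation as an unevaluated string. A minimal sketch, mirroring the diff's own `with_pos_embed` helper:

```python
# Sketch: PEP 604 unions in annotations stay unevaluated with the future import,
# so this parses and runs on Python 3.8/3.9 as well as 3.10+.
from __future__ import annotations

import torch


def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None = None) -> torch.Tensor:
    """Add position embeddings to the tensor if provided."""
    return tensor if pos is None else tensor + pos


print(with_pos_embed(torch.zeros(2, 4)).shape)  # torch.Size([2, 4])
```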
```diff
@@ -8,26 +10,30 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import constant_, xavier_uniform_
 
+from ultralytics.utils.torch_utils import TORCH_1_11
+
 from .conv import Conv
 from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
 
 __all__ = (
-    "TransformerEncoderLayer",
-    "TransformerLayer",
-    "TransformerBlock",
-    "MLPBlock",
-    "LayerNorm2d",
     "AIFI",
+    "MLP",
     "DeformableTransformerDecoder",
     "DeformableTransformerDecoderLayer",
+    "LayerNorm2d",
+    "MLPBlock",
     "MSDeformAttn",
-    "MLP",
+    "TransformerBlock",
+    "TransformerEncoderLayer",
+    "TransformerLayer",
 )
 
 
 class TransformerEncoderLayer(nn.Module):
-    """
-    Defines a single layer of the transformer encoder.
+    """A single layer of the transformer encoder.
+
+    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
+    supporting both pre-normalization and post-normalization configurations.
 
     Attributes:
         ma (nn.MultiheadAttention): Multi-head attention module.
```
```diff
@@ -42,9 +48,16 @@ class TransformerEncoderLayer(nn.Module):
         normalize_before (bool): Whether to apply normalization before attention and feedforward.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
-        """
-        Initialize the TransformerEncoderLayer with specified parameters.
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0.0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
+        """Initialize the TransformerEncoderLayer with specified parameters.
 
         Args:
             c1 (int): Input dimension.
```
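A usage sketch of the newly annotated constructor; argument names and defaults are taken from the diff, while the `(batch, sequence, channels)` token layout is an assumption based on the layer's documented tensor shapes:

```python
import torch
import torch.nn as nn

from ultralytics.nn.modules.transformer import TransformerEncoderLayer

# 256-dim tokens, 2048-dim feedforward, 8 heads, post-norm (the default).
layer = TransformerEncoderLayer(c1=256, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False)
tokens = torch.randn(2, 100, 256)  # assumed (batch, sequence, channels) layout
print(layer(tokens).shape)  # torch.Size([2, 100, 256])
```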
```diff
@@ -76,13 +89,18 @@ class TransformerEncoderLayer(nn.Module):
         self.normalize_before = normalize_before
 
     @staticmethod
-    def with_pos_embed(tensor, pos=None):
+    def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None = None) -> torch.Tensor:
         """Add position embeddings to the tensor if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
-        Perform forward pass with post-normalization.
+    def forward_post(
+        self,
+        src: torch.Tensor,
+        src_mask: torch.Tensor | None = None,
+        src_key_padding_mask: torch.Tensor | None = None,
+        pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform forward pass with post-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -101,9 +119,14 @@ class TransformerEncoderLayer(nn.Module):
         src = src + self.dropout2(src2)
         return self.norm2(src)
 
-    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
-        Perform forward pass with pre-normalization.
+    def forward_pre(
+        self,
+        src: torch.Tensor,
+        src_mask: torch.Tensor | None = None,
+        src_key_padding_mask: torch.Tensor | None = None,
+        pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform forward pass with pre-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -122,9 +145,14 @@ class TransformerEncoderLayer(nn.Module):
         src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
         return src + self.dropout2(src2)
 
-    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
-        """
-        Forward propagate the input through the encoder module.
+    def forward(
+        self,
+        src: torch.Tensor,
+        src_mask: torch.Tensor | None = None,
+        src_key_padding_mask: torch.Tensor | None = None,
+        pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Forward propagate the input through the encoder module.
 
         Args:
             src (torch.Tensor): Input tensor.
```
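The three annotated methods above implement the usual pre-norm/post-norm split: `forward` dispatches to `forward_pre` when `normalize_before=True` and to `forward_post` otherwise. A conceptual sketch of the two residual patterns (simplified, not the diff's exact bodies):

```python
# Post-norm: normalize after each residual addition.
def post_norm_step(x, sublayer, norm, dropout):
    return norm(x + dropout(sublayer(x)))


# Pre-norm: normalize before the sublayer, leaving the residual path clean.
def pre_norm_step(x, sublayer, norm, dropout):
    return x + dropout(sublayer(norm(x)))
```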
```diff
@@ -141,15 +169,22 @@ class TransformerEncoderLayer(nn.Module):
 
 
 class AIFI(TransformerEncoderLayer):
-    """
-    Defines the AIFI transformer layer.
+    """AIFI transformer layer for 2D data with positional embeddings.
 
-    This class extends TransformerEncoderLayer to work with 2D data by adding positional embeddings.
+    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
+    embeddings and handling the spatial dimensions appropriately.
     """
 
-    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
-        """
-        Initialize the AIFI instance with specified parameters.
+    def __init__(
+        self,
+        c1: int,
+        cm: int = 2048,
+        num_heads: int = 8,
+        dropout: float = 0,
+        act: nn.Module = nn.GELU(),
+        normalize_before: bool = False,
+    ):
+        """Initialize the AIFI instance with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -161,9 +196,8 @@ class AIFI(TransformerEncoderLayer):
         """
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
-    def forward(self, x):
-        """
-        Forward pass for the AIFI transformer layer.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass for the AIFI transformer layer.
 
         Args:
             x (torch.Tensor): Input tensor with shape [B, C, H, W].
@@ -178,9 +212,10 @@ class AIFI(TransformerEncoderLayer):
         return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
 
     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
-        """
-        Build 2D sine-cosine position embedding.
+    def build_2d_sincos_position_embedding(
+        w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
+    ) -> torch.Tensor:
+        """Build 2D sine-cosine position embedding.
 
         Args:
             w (int): Width of the feature map.
@@ -194,7 +229,7 @@ class AIFI(TransformerEncoderLayer):
         assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         grid_w = torch.arange(w, dtype=torch.float32)
         grid_h = torch.arange(h, dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") if TORCH_1_11 else torch.meshgrid(grid_w, grid_h)
         pos_dim = embed_dim // 4
         omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
         omega = 1.0 / (temperature**omega)
```
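The new `TORCH_1_11` guard keeps the `meshgrid` call compatible with older torch builds that lack the `indexing` keyword. For reference, a standalone sketch of the 2D sine-cosine embedding this hunk builds; the grid/omega setup is shown in the diff, while the final sin/cos concatenation follows the standard construction and is an assumption here:

```python
import torch

# Sketch of the full 2D sin-cos position embedding for a w x h feature map.
w, h, embed_dim, temperature = 4, 3, 256, 10000.0
grid_w = torch.arange(w, dtype=torch.float32)
grid_h = torch.arange(h, dtype=torch.float32)
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
pos_dim = embed_dim // 4
omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
omega = 1.0 / (temperature**omega)
out_w = grid_w.flatten()[..., None] @ omega[None]  # (w*h, pos_dim)
out_h = grid_h.flatten()[..., None] @ omega[None]
pe = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None]
print(pe.shape)  # torch.Size([1, 12, 256]) -> one embedding per spatial position
```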
```diff
@@ -208,9 +243,8 @@ class AIFI(TransformerEncoderLayer):
 class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
-    def __init__(self, c, num_heads):
-        """
-        Initialize a self-attention mechanism using linear transformations and multi-head attention.
+    def __init__(self, c: int, num_heads: int):
+        """Initialize a self-attention mechanism using linear transformations and multi-head attention.
 
         Args:
             c (int): Input and output channel dimension.
@@ -224,9 +258,8 @@ class TransformerLayer(nn.Module):
         self.fc1 = nn.Linear(c, c, bias=False)
         self.fc2 = nn.Linear(c, c, bias=False)
 
-    def forward(self, x):
-        """
-        Apply a transformer block to the input x and return the output.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply a transformer block to the input x and return the output.
 
         Args:
             x (torch.Tensor): Input tensor.
```
```diff
@@ -239,8 +272,10 @@ class TransformerLayer(nn.Module):
 
 
 class TransformerBlock(nn.Module):
-    """
-    Vision Transformer https://arxiv.org/abs/2010.11929.
+    """Vision Transformer block based on https://arxiv.org/abs/2010.11929.
+
+    This class implements a complete transformer block with optional convolution layer for channel adjustment, learnable
+    position embedding, and multiple transformer layers.
 
     Attributes:
         conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -249,9 +284,8 @@ class TransformerBlock(nn.Module):
         c2 (int): Output channel dimension.
     """
 
-    def __init__(self, c1, c2, num_heads, num_layers):
-        """
-        Initialize a Transformer module with position embedding and specified number of heads and layers.
+    def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
+        """Initialize a Transformer module with position embedding and specified number of heads and layers.
 
         Args:
             c1 (int): Input channel dimension.
@@ -267,9 +301,8 @@ class TransformerBlock(nn.Module):
         self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
         self.c2 = c2
 
-    def forward(self, x):
-        """
-        Forward propagates the input through the bottleneck module.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward propagate the input through the transformer block.
 
         Args:
             x (torch.Tensor): Input tensor with shape [b, c1, w, h].
```
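A usage sketch matching the docstring's `[b, c1, w, h]` contract; per the Attributes section above, the channel-adjusting conv is created only when `c1 != c2`, so the expected output shape here is an assumption drawn from that description:

```python
import torch

from ultralytics.nn.modules.transformer import TransformerBlock

# c1 != c2, so the block inserts a Conv to adjust channels before the layers.
block = TransformerBlock(c1=64, c2=128, num_heads=4, num_layers=2)
x = torch.randn(1, 64, 20, 20)
print(block(x).shape)  # expected torch.Size([1, 128, 20, 20])
```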
```diff
@@ -285,11 +318,10 @@ class TransformerBlock(nn.Module):
 
 
 class MLPBlock(nn.Module):
-    """Implements a single block of a multi-layer perceptron."""
+    """A single block of a multi-layer perceptron."""
 
-    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
-        """
-        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
+    def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
+        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
 
         Args:
             embedding_dim (int): Input and output dimension.
@@ -302,8 +334,7 @@ class MLPBlock(nn.Module):
         self.act = act()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the MLPBlock.
+        """Forward pass for the MLPBlock.
 
         Args:
             x (torch.Tensor): Input tensor.
```
```diff
@@ -315,8 +346,10 @@ class MLPBlock(nn.Module):
 
 
 class MLP(nn.Module):
-    """
-    Implements a simple multi-layer perceptron (also called FFN).
+    """A simple multi-layer perceptron (also called FFN).
+
+    This class implements a configurable MLP with multiple linear layers, activation functions, and optional sigmoid
+    output activation.
 
     Attributes:
         num_layers (int): Number of layers in the MLP.
@@ -325,9 +358,10 @@ class MLP(nn.Module):
         act (nn.Module): Activation function.
     """
 
-    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act=nn.ReLU, sigmoid=False):
-        """
-        Initialize the MLP with specified input, hidden, output dimensions and number of layers.
+    def __init__(
+        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+    ):
+        """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
         Args:
             input_dim (int): Input dimension.
@@ -340,13 +374,12 @@ class MLP(nn.Module):
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
        self.sigmoid = sigmoid
         self.act = act()
 
-    def forward(self, x):
-        """
-        Forward pass for the entire MLP.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass for the entire MLP.
 
         Args:
             x (torch.Tensor): Input tensor.
```
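The rewritten layer construction is behavior-neutral: `[input_dim, *h]` / `[*h, output_dim]` merely replaces list concatenation with unpacking. A quick check of the dimension pairs it produces:

```python
# Dimension pairing produced by the zip in MLP.__init__ (sketch).
input_dim, hidden_dim, output_dim, num_layers = 256, 512, 4, 3
h = [hidden_dim] * (num_layers - 1)
print(list(zip([input_dim, *h], [*h, output_dim])))
# [(256, 512), (512, 512), (512, 4)] -> Linear(256->512), Linear(512->512), Linear(512->4)
```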
```diff
@@ -360,23 +393,23 @@ class MLP(nn.Module):
 
 
 class LayerNorm2d(nn.Module):
-    """
-    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+    """2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
 
-    Original implementations in
-    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
-    and
-    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
+    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension while
+    preserving spatial dimensions.
 
     Attributes:
         weight (nn.Parameter): Learnable scale parameter.
         bias (nn.Parameter): Learnable bias parameter.
         eps (float): Small constant for numerical stability.
+
+    References:
+        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
+        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
     """
 
-    def __init__(self, num_channels, eps=1e-6):
-        """
-        Initialize LayerNorm2d with the given parameters.
+    def __init__(self, num_channels: int, eps: float = 1e-6):
+        """Initialize LayerNorm2d with the given parameters.
 
         Args:
             num_channels (int): Number of channels in the input.
@@ -387,9 +420,8 @@ class LayerNorm2d(nn.Module):
         self.bias = nn.Parameter(torch.zeros(num_channels))
         self.eps = eps
 
-    def forward(self, x):
-        """
-        Perform forward pass for 2D layer normalization.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Perform forward pass for 2D layer normalization.
 
         Args:
             x (torch.Tensor): Input tensor.
```
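A sketch of the channel-wise statistics `LayerNorm2d` computes on NCHW maps, per its docstring; the actual forward body sits outside the hunks shown, so this mirrors the referenced Detectron2/ConvNeXt pattern (without the learnable weight/bias):

```python
import torch

# Channel-wise layer norm over an NCHW feature map (sketch).
x = torch.randn(2, 8, 4, 4)
eps = 1e-6
u = x.mean(1, keepdim=True)               # per-pixel mean over the channel dim
s = (x - u).pow(2).mean(1, keepdim=True)  # per-pixel variance over channels
y = (x - u) / torch.sqrt(s + eps)         # normalized, spatial dims preserved
print(y.shape)  # torch.Size([2, 8, 4, 4])
```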
```diff
@@ -404,10 +436,10 @@ class LayerNorm2d(nn.Module):
 
 
 class MSDeformAttn(nn.Module):
-    """
-    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
+    """Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
-    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+    This module implements multiscale deformable attention that can attend to features at multiple scales with learnable
+    sampling locations and attention weights.
 
     Attributes:
         im2col_step (int): Step size for im2col operations.
@@ -419,11 +451,13 @@ class MSDeformAttn(nn.Module):
         attention_weights (nn.Linear): Linear layer for generating attention weights.
         value_proj (nn.Linear): Linear layer for projecting values.
         output_proj (nn.Linear): Linear layer for projecting output.
+
+    References:
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
     """
 
-    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
-        """
-        Initialize MSDeformAttn with the given parameters.
+    def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
+        """Initialize MSDeformAttn with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -473,23 +507,30 @@ class MSDeformAttn(nn.Module):
         xavier_uniform_(self.output_proj.weight.data)
         constant_(self.output_proj.bias.data, 0.0)
 
-    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
-        """
-        Perform forward pass for multiscale deformable attention.
-
-        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+    def forward(
+        self,
+        query: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        value: torch.Tensor,
+        value_shapes: list,
+        value_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform forward pass for multiscale deformable attention.
 
         Args:
-            query (torch.Tensor): [bs, query_length, C]
-            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1],
-                top-left (0,0), bottom-right (1, 1), including padding area.
-            value (torch.Tensor): [bs, value_length, C]
+            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
+            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2], range in [0,
+                1], top-left (0,0), bottom-right (1, 1), including padding area.
+            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
             value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
-            value_mask (torch.Tensor, optional): [bs, value_length], True for non-padding elements,
-                False for padding elements.
+            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
+                elements, False for padding elements.
 
         Returns:
             (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
+
+        References:
+            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
         """
         bs, len_q = query.shape[:2]
         len_v = value.shape[1]
```
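A shape walkthrough for the annotated forward above (a usage sketch; all shapes follow the documented contract, and `value_length` is assumed to equal the sum of `H * W` over levels):

```python
import torch

from ultralytics.nn.modules.transformer import MSDeformAttn

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
shapes = [(16, 16), (8, 8)]                   # (H_i, W_i) per level
value = torch.randn(1, 16 * 16 + 8 * 8, 256)  # value_length = sum(H * W)
query = torch.randn(1, 50, 256)
refer_bbox = torch.rand(1, 50, 2, 2)          # [bs, query_length, n_levels, 2]
print(attn(query, refer_bbox, value, shapes).shape)  # torch.Size([1, 50, 256])
```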
```diff
@@ -518,11 +559,10 @@ class MSDeformAttn(nn.Module):
 
 
 class DeformableTransformerDecoderLayer(nn.Module):
-    """
-    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
+    """Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
 
-    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
-    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
+    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
+    attention, and a feedforward network.
 
     Attributes:
         self_attn (nn.MultiheadAttention): Self-attention module.
@@ -537,11 +577,23 @@ class DeformableTransformerDecoderLayer(nn.Module):
         linear2 (nn.Linear): Second linear layer in the feedforward network.
         dropout4 (nn.Dropout): Dropout after the feedforward network.
         norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
     """
 
-    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
-        """
-        Initialize the DeformableTransformerDecoderLayer with the given parameters.
+    def __init__(
+        self,
+        d_model: int = 256,
+        n_heads: int = 8,
+        d_ffn: int = 1024,
+        dropout: float = 0.0,
+        act: nn.Module = nn.ReLU(),
+        n_levels: int = 4,
+        n_points: int = 4,
+    ):
+        """Initialize the DeformableTransformerDecoderLayer with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -573,13 +625,12 @@ class DeformableTransformerDecoderLayer(nn.Module):
         self.norm3 = nn.LayerNorm(d_model)
 
     @staticmethod
-    def with_pos_embed(tensor, pos):
+    def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None) -> torch.Tensor:
         """Add positional embeddings to the input tensor, if provided."""
         return tensor if pos is None else tensor + pos
 
-    def forward_ffn(self, tgt):
-        """
-        Perform forward pass through the Feed-Forward Network part of the layer.
+    def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
+        """Perform forward pass through the Feed-Forward Network part of the layer.
 
         Args:
             tgt (torch.Tensor): Input tensor.
@@ -591,9 +642,17 @@ class DeformableTransformerDecoderLayer(nn.Module):
         tgt = tgt + self.dropout4(tgt2)
         return self.norm3(tgt)
 
-    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
-        """
-        Perform the forward pass through the entire decoder layer.
+    def forward(
+        self,
+        embed: torch.Tensor,
+        refer_bbox: torch.Tensor,
+        feats: torch.Tensor,
+        shapes: list,
+        padding_mask: torch.Tensor | None = None,
+        attn_mask: torch.Tensor | None = None,
+        query_pos: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Perform the forward pass through the entire decoder layer.
 
         Args:
             embed (torch.Tensor): Input embeddings.
```
```diff
@@ -627,21 +686,23 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
 
 class DeformableTransformerDecoder(nn.Module):
-    """
-    Implementation of Deformable Transformer Decoder based on PaddleDetection.
+    """Deformable Transformer Decoder based on PaddleDetection implementation.
 
-    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
+    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction heads
+    for bounding box regression and classification.
 
     Attributes:
         layers (nn.ModuleList): List of decoder layers.
         num_layers (int): Number of decoder layers.
         hidden_dim (int): Hidden dimension.
         eval_idx (int): Index of the layer to use during evaluation.
+
+    References:
+        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
     """
 
-    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
-        """
-        Initialize the DeformableTransformerDecoder with the given parameters.
+    def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
+        """Initialize the DeformableTransformerDecoder with the given parameters.
 
         Args:
             hidden_dim (int): Hidden dimension.
@@ -657,18 +718,17 @@ class DeformableTransformerDecoder(nn.Module):
 
     def forward(
         self,
-        embed,  # decoder embeddings
-        refer_bbox,  # anchor
-        feats,  # image features
-        shapes,  # feature shapes
-        bbox_head,
-        score_head,
-        pos_mlp,
-        attn_mask=None,
-        padding_mask=None,
+        embed: torch.Tensor,  # decoder embeddings
+        refer_bbox: torch.Tensor,  # anchor
+        feats: torch.Tensor,  # image features
+        shapes: list,  # feature shapes
+        bbox_head: nn.Module,
+        score_head: nn.Module,
+        pos_mlp: nn.Module,
+        attn_mask: torch.Tensor | None = None,
+        padding_mask: torch.Tensor | None = None,
     ):
-        """
-        Perform the forward pass through the entire decoder.
+        """Perform the forward pass through the entire decoder.
 
         Args:
             embed (torch.Tensor): Decoder embeddings.
```
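Context for the decoder's `bbox_head`/`score_head` arguments: Deformable-DETR-style decoders refine normalized anchors layer by layer in logit space, which is where `inverse_sigmoid` from `ultralytics/nn/modules/utils.py` (diffed below) comes in. A conceptual sketch under that assumption, not the decoder's actual body:

```python
import torch

from ultralytics.nn.modules.utils import inverse_sigmoid

refer_bbox = torch.rand(1, 100, 4)  # normalized anchor boxes
delta = torch.zeros(1, 100, 4)      # stand-in for a per-layer bbox_head output
refined = torch.sigmoid(delta + inverse_sigmoid(refer_bbox))
print(torch.allclose(refined, refer_bbox, atol=1e-4))  # True: zero delta keeps anchors
```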
ultralytics/nn/modules/utils.py
CHANGED
```diff
@@ -9,12 +9,11 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import uniform_
 
-__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
+__all__ = "inverse_sigmoid", "multi_scale_deformable_attn_pytorch"
 
 
 def _get_clones(module, n):
-    """
-    Create a list of cloned modules from the given module.
+    """Create a list of cloned modules from the given module.
 
     Args:
         module (nn.Module): The module to be cloned.
```
```diff
@@ -34,12 +33,11 @@ def _get_clones(module, n):
 
 
 def bias_init_with_prob(prior_prob=0.01):
-    """
-    Initialize conv/fc bias value according to a given probability value.
+    """Initialize conv/fc bias value according to a given probability value.
 
-    This function calculates the bias initialization value based on a prior probability using the inverse error function.
-    It's commonly used in object detection models to initialize classification layers with a specific positive
-    probability.
+    This function calculates the bias initialization value based on a prior probability using the inverse error
+    function. It's commonly used in object detection models to initialize classification layers with a specific positive
+    prediction probability.
 
     Args:
         prior_prob (float, optional): Prior probability for bias initialization.
```
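The function body is outside this hunk; the conventional prior-probability initialization it describes solves `sigmoid(b) = prior_prob` for the bias. A sketch of that log-odds form (an assumption about the implementation, but standard for RetinaNet-style classification heads):

```python
import math

# Solve sigmoid(b) = prior_prob for b, so the head starts out predicting
# "positive" with probability prior_prob.
prior_prob = 0.01
bias = -math.log((1 - prior_prob) / prior_prob)
print(round(bias, 4))  # -4.5951, and sigmoid(-4.5951) == 0.01
```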
```diff
@@ -56,11 +54,10 @@ def bias_init_with_prob(prior_prob=0.01):
 
 
 def linear_init(module):
-    """
-    Initialize the weights and biases of a linear module.
+    """Initialize the weights and biases of a linear module.
 
-    This function initializes the weights of a linear module using a uniform distribution within bounds calculated
-    from the input dimension. If the module has a bias, it is also initialized.
+    This function initializes the weights of a linear module using a uniform distribution within bounds calculated from
+    the input dimension. If the module has a bias, it is also initialized.
 
     Args:
         module (nn.Module): Linear module to initialize.
```
```diff
@@ -80,8 +77,7 @@ def linear_init(module):
 
 
 def inverse_sigmoid(x, eps=1e-5):
-    """
-    Calculate the inverse sigmoid function for a tensor.
+    """Calculate the inverse sigmoid function for a tensor.
 
     This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
     operations, particularly in attention mechanisms and coordinate transformations.
```
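A sketch of the logit computation with `eps` clamping that the docstring describes; the exact clamping order in the source may differ slightly:

```python
import torch


def inverse_sigmoid_sketch(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    x = x.clamp(min=0, max=1)    # keep inputs in the valid probability range
    x1 = x.clamp(min=eps)        # avoid log(0)
    x2 = (1 - x).clamp(min=eps)  # avoid division by zero
    return torch.log(x1 / x2)


print(inverse_sigmoid_sketch(torch.tensor([0.5])))  # tensor([0.])
```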
```diff
@@ -110,8 +106,7 @@ def multi_scale_deformable_attn_pytorch(
     sampling_locations: torch.Tensor,
     attention_weights: torch.Tensor,
 ) -> torch.Tensor:
-    """
-    Implement multi-scale deformable attention in PyTorch.
+    """Implement multi-scale deformable attention in PyTorch.
 
     This function performs deformable attention across multiple feature map scales, allowing the model to attend to
     different spatial locations with learned offsets.
@@ -119,10 +114,10 @@ def multi_scale_deformable_attn_pytorch(
     Args:
         value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
         value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
-        sampling_locations (torch.Tensor): The sampling locations with shape
-            (bs, num_queries, num_heads, num_levels, num_points, 2).
-        attention_weights (torch.Tensor): The attention weights with shape
-            (bs, num_queries, num_heads, num_levels, num_points).
+        sampling_locations (torch.Tensor): The sampling locations with shape (bs, num_queries, num_heads, num_levels,
+            num_points, 2).
+        attention_weights (torch.Tensor): The attention weights with shape (bs, num_queries, num_heads, num_levels,
+            num_points).
 
     Returns:
         (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
```
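A shape check exercising the documented tensor contract (a usage sketch: two levels, `num_keys` assumed equal to the sum of `H * W` over levels, weights normalized over the levels and points dimensions):

```python
import torch

from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch

bs, heads, dims, queries, points = 1, 8, 32, 10, 4
shapes = torch.tensor([[16, 16], [8, 8]])             # (num_levels, 2)
value = torch.randn(bs, 16 * 16 + 8 * 8, heads, dims)
locs = torch.rand(bs, queries, heads, 2, points, 2)   # normalized sampling points
weights = torch.rand(bs, queries, heads, 2, points)
weights = weights / weights.sum((-2, -1), keepdim=True)
out = multi_scale_deformable_attn_pytorch(value, shapes, locs, weights)
print(out.shape)  # torch.Size([1, 10, 256]) -> (bs, num_queries, num_heads * embed_dims)
```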