dgenerate-ultralytics-headless 8.3.196__py3-none-any.whl → 8.3.248__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/METADATA +33 -34
- dgenerate_ultralytics_headless-8.3.248.dist-info/RECORD +298 -0
- tests/__init__.py +5 -7
- tests/conftest.py +8 -15
- tests/test_cli.py +8 -10
- tests/test_cuda.py +9 -10
- tests/test_engine.py +29 -2
- tests/test_exports.py +69 -21
- tests/test_integrations.py +8 -11
- tests/test_python.py +109 -71
- tests/test_solutions.py +170 -159
- ultralytics/__init__.py +27 -9
- ultralytics/cfg/__init__.py +57 -64
- ultralytics/cfg/datasets/Argoverse.yaml +7 -6
- ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
- ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
- ultralytics/cfg/datasets/ImageNet.yaml +1 -1
- ultralytics/cfg/datasets/Objects365.yaml +19 -15
- ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
- ultralytics/cfg/datasets/VOC.yaml +19 -21
- ultralytics/cfg/datasets/VisDrone.yaml +5 -5
- ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
- ultralytics/cfg/datasets/coco-pose.yaml +24 -2
- ultralytics/cfg/datasets/coco.yaml +2 -2
- ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
- ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
- ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
- ultralytics/cfg/datasets/dog-pose.yaml +28 -0
- ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
- ultralytics/cfg/datasets/kitti.yaml +27 -0
- ultralytics/cfg/datasets/lvis.yaml +7 -7
- ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
- ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
- ultralytics/cfg/datasets/xView.yaml +16 -16
- ultralytics/cfg/default.yaml +96 -94
- ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
- ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
- ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
- ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
- ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
- ultralytics/cfg/models/v6/yolov6.yaml +1 -1
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
- ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
- ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/cfg/trackers/botsort.yaml +16 -17
- ultralytics/cfg/trackers/bytetrack.yaml +9 -11
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +3 -4
- ultralytics/data/augment.py +286 -476
- ultralytics/data/base.py +18 -26
- ultralytics/data/build.py +151 -26
- ultralytics/data/converter.py +38 -50
- ultralytics/data/dataset.py +47 -75
- ultralytics/data/loaders.py +42 -49
- ultralytics/data/split.py +5 -6
- ultralytics/data/split_dota.py +8 -15
- ultralytics/data/utils.py +41 -45
- ultralytics/engine/exporter.py +462 -462
- ultralytics/engine/model.py +150 -191
- ultralytics/engine/predictor.py +30 -40
- ultralytics/engine/results.py +177 -311
- ultralytics/engine/trainer.py +193 -120
- ultralytics/engine/tuner.py +77 -63
- ultralytics/engine/validator.py +39 -22
- ultralytics/hub/__init__.py +16 -19
- ultralytics/hub/auth.py +6 -12
- ultralytics/hub/google/__init__.py +7 -10
- ultralytics/hub/session.py +15 -25
- ultralytics/hub/utils.py +5 -8
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +8 -10
- ultralytics/models/fastsam/predict.py +19 -30
- ultralytics/models/fastsam/utils.py +1 -2
- ultralytics/models/fastsam/val.py +5 -7
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +5 -8
- ultralytics/models/nas/predict.py +7 -9
- ultralytics/models/nas/val.py +1 -2
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +7 -8
- ultralytics/models/rtdetr/predict.py +15 -19
- ultralytics/models/rtdetr/train.py +10 -13
- ultralytics/models/rtdetr/val.py +21 -23
- ultralytics/models/sam/__init__.py +15 -2
- ultralytics/models/sam/amg.py +14 -20
- ultralytics/models/sam/build.py +26 -19
- ultralytics/models/sam/build_sam3.py +377 -0
- ultralytics/models/sam/model.py +29 -32
- ultralytics/models/sam/modules/blocks.py +83 -144
- ultralytics/models/sam/modules/decoders.py +22 -40
- ultralytics/models/sam/modules/encoders.py +44 -101
- ultralytics/models/sam/modules/memory_attention.py +16 -30
- ultralytics/models/sam/modules/sam.py +206 -79
- ultralytics/models/sam/modules/tiny_encoder.py +64 -83
- ultralytics/models/sam/modules/transformer.py +18 -28
- ultralytics/models/sam/modules/utils.py +174 -50
- ultralytics/models/sam/predict.py +2268 -366
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +529 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +199 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +339 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/vitdet.py +547 -0
- ultralytics/models/sam/sam3/vl_combiner.py +160 -0
- ultralytics/models/utils/loss.py +14 -26
- ultralytics/models/utils/ops.py +13 -17
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +9 -12
- ultralytics/models/yolo/classify/train.py +15 -41
- ultralytics/models/yolo/classify/val.py +34 -32
- ultralytics/models/yolo/detect/predict.py +8 -11
- ultralytics/models/yolo/detect/train.py +13 -32
- ultralytics/models/yolo/detect/val.py +75 -63
- ultralytics/models/yolo/model.py +37 -53
- ultralytics/models/yolo/obb/predict.py +5 -14
- ultralytics/models/yolo/obb/train.py +11 -14
- ultralytics/models/yolo/obb/val.py +42 -39
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +7 -22
- ultralytics/models/yolo/pose/train.py +10 -22
- ultralytics/models/yolo/pose/val.py +40 -59
- ultralytics/models/yolo/segment/predict.py +16 -20
- ultralytics/models/yolo/segment/train.py +3 -12
- ultralytics/models/yolo/segment/val.py +106 -56
- ultralytics/models/yolo/world/train.py +12 -16
- ultralytics/models/yolo/world/train_world.py +11 -34
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +16 -23
- ultralytics/models/yolo/yoloe/train.py +31 -56
- ultralytics/models/yolo/yoloe/train_seg.py +5 -10
- ultralytics/models/yolo/yoloe/val.py +16 -21
- ultralytics/nn/__init__.py +7 -7
- ultralytics/nn/autobackend.py +152 -80
- ultralytics/nn/modules/__init__.py +60 -60
- ultralytics/nn/modules/activation.py +4 -6
- ultralytics/nn/modules/block.py +133 -217
- ultralytics/nn/modules/conv.py +52 -97
- ultralytics/nn/modules/head.py +64 -116
- ultralytics/nn/modules/transformer.py +79 -89
- ultralytics/nn/modules/utils.py +16 -21
- ultralytics/nn/tasks.py +111 -156
- ultralytics/nn/text_model.py +40 -67
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +11 -17
- ultralytics/solutions/analytics.py +15 -16
- ultralytics/solutions/config.py +5 -6
- ultralytics/solutions/distance_calculation.py +10 -13
- ultralytics/solutions/heatmap.py +7 -13
- ultralytics/solutions/instance_segmentation.py +5 -8
- ultralytics/solutions/object_blurrer.py +7 -10
- ultralytics/solutions/object_counter.py +12 -19
- ultralytics/solutions/object_cropper.py +8 -14
- ultralytics/solutions/parking_management.py +33 -31
- ultralytics/solutions/queue_management.py +10 -12
- ultralytics/solutions/region_counter.py +9 -12
- ultralytics/solutions/security_alarm.py +15 -20
- ultralytics/solutions/similarity_search.py +13 -17
- ultralytics/solutions/solutions.py +75 -74
- ultralytics/solutions/speed_estimation.py +7 -10
- ultralytics/solutions/streamlit_inference.py +4 -7
- ultralytics/solutions/templates/similarity-search.html +7 -18
- ultralytics/solutions/trackzone.py +7 -10
- ultralytics/solutions/vision_eye.py +5 -8
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +3 -5
- ultralytics/trackers/bot_sort.py +10 -27
- ultralytics/trackers/byte_tracker.py +14 -30
- ultralytics/trackers/track.py +3 -6
- ultralytics/trackers/utils/gmc.py +11 -22
- ultralytics/trackers/utils/kalman_filter.py +37 -48
- ultralytics/trackers/utils/matching.py +12 -15
- ultralytics/utils/__init__.py +116 -116
- ultralytics/utils/autobatch.py +2 -4
- ultralytics/utils/autodevice.py +17 -18
- ultralytics/utils/benchmarks.py +70 -70
- ultralytics/utils/callbacks/base.py +8 -10
- ultralytics/utils/callbacks/clearml.py +5 -13
- ultralytics/utils/callbacks/comet.py +32 -46
- ultralytics/utils/callbacks/dvc.py +13 -18
- ultralytics/utils/callbacks/mlflow.py +4 -5
- ultralytics/utils/callbacks/neptune.py +7 -15
- ultralytics/utils/callbacks/platform.py +314 -38
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +23 -31
- ultralytics/utils/callbacks/wb.py +10 -13
- ultralytics/utils/checks.py +151 -87
- ultralytics/utils/cpu.py +3 -8
- ultralytics/utils/dist.py +19 -15
- ultralytics/utils/downloads.py +29 -41
- ultralytics/utils/errors.py +6 -14
- ultralytics/utils/events.py +2 -4
- ultralytics/utils/export/__init__.py +7 -0
- ultralytics/utils/{export.py → export/engine.py} +16 -16
- ultralytics/utils/export/imx.py +325 -0
- ultralytics/utils/export/tensorflow.py +231 -0
- ultralytics/utils/files.py +24 -28
- ultralytics/utils/git.py +9 -11
- ultralytics/utils/instance.py +30 -51
- ultralytics/utils/logger.py +212 -114
- ultralytics/utils/loss.py +15 -24
- ultralytics/utils/metrics.py +131 -160
- ultralytics/utils/nms.py +21 -30
- ultralytics/utils/ops.py +107 -165
- ultralytics/utils/patches.py +33 -21
- ultralytics/utils/plotting.py +122 -119
- ultralytics/utils/tal.py +28 -44
- ultralytics/utils/torch_utils.py +70 -187
- ultralytics/utils/tqdm.py +20 -20
- ultralytics/utils/triton.py +13 -19
- ultralytics/utils/tuner.py +17 -5
- dgenerate_ultralytics_headless-8.3.196.dist-info/RECORD +0 -281
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/transformer.py
CHANGED

```diff
@@ -10,26 +10,27 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import constant_, xavier_uniform_
 
+from ultralytics.utils.torch_utils import TORCH_1_11
+
 from .conv import Conv
 from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
 
 __all__ = (
-    "TransformerEncoderLayer",
-    "TransformerLayer",
-    "TransformerBlock",
-    "MLPBlock",
-    "LayerNorm2d",
     "AIFI",
+    "MLP",
     "DeformableTransformerDecoder",
     "DeformableTransformerDecoderLayer",
+    "LayerNorm2d",
+    "MLPBlock",
     "MSDeformAttn",
-    "MLP",
+    "TransformerBlock",
+    "TransformerEncoderLayer",
+    "TransformerLayer",
 )
 
 
 class TransformerEncoderLayer(nn.Module):
-    """
-    A single layer of the transformer encoder.
+    """A single layer of the transformer encoder.
 
     This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
     supporting both pre-normalization and post-normalization configurations.
@@ -56,8 +57,7 @@ class TransformerEncoderLayer(nn.Module):
         act: nn.Module = nn.GELU(),
         normalize_before: bool = False,
     ):
-        """
-        Initialize the TransformerEncoderLayer with specified parameters.
+        """Initialize the TransformerEncoderLayer with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -100,8 +100,7 @@ class TransformerEncoderLayer(nn.Module):
         src_key_padding_mask: torch.Tensor | None = None,
         pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform forward pass with post-normalization.
+        """Perform forward pass with post-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -127,8 +126,7 @@ class TransformerEncoderLayer(nn.Module):
         src_key_padding_mask: torch.Tensor | None = None,
         pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform forward pass with pre-normalization.
+        """Perform forward pass with pre-normalization.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -154,8 +152,7 @@ class TransformerEncoderLayer(nn.Module):
         src_key_padding_mask: torch.Tensor | None = None,
         pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Forward propagate the input through the encoder module.
+        """Forward propagate the input through the encoder module.
 
         Args:
             src (torch.Tensor): Input tensor.
@@ -172,8 +169,7 @@ class TransformerEncoderLayer(nn.Module):
 
 
 class AIFI(TransformerEncoderLayer):
-    """
-    AIFI transformer layer for 2D data with positional embeddings.
+    """AIFI transformer layer for 2D data with positional embeddings.
 
     This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
     embeddings and handling the spatial dimensions appropriately.
@@ -188,8 +184,7 @@ class AIFI(TransformerEncoderLayer):
         act: nn.Module = nn.GELU(),
         normalize_before: bool = False,
     ):
-        """
-        Initialize the AIFI instance with specified parameters.
+        """Initialize the AIFI instance with specified parameters.
 
         Args:
             c1 (int): Input dimension.
@@ -202,8 +197,7 @@ class AIFI(TransformerEncoderLayer):
         super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the AIFI transformer layer.
+        """Forward pass for the AIFI transformer layer.
 
         Args:
             x (torch.Tensor): Input tensor with shape [B, C, H, W].
@@ -221,8 +215,7 @@ class AIFI(TransformerEncoderLayer):
     def build_2d_sincos_position_embedding(
         w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
     ) -> torch.Tensor:
-        """
-        Build 2D sine-cosine position embedding.
+        """Build 2D sine-cosine position embedding.
 
         Args:
             w (int): Width of the feature map.
@@ -236,7 +229,7 @@ class AIFI(TransformerEncoderLayer):
         assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         grid_w = torch.arange(w, dtype=torch.float32)
         grid_h = torch.arange(h, dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") if TORCH_1_11 else torch.meshgrid(grid_w, grid_h)
         pos_dim = embed_dim // 4
         omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
         omega = 1.0 / (temperature**omega)
```
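The only behavioral change in the hunks above is the guarded `torch.meshgrid` call in `build_2d_sincos_position_embedding`; the rest is docstring reformatting. Below is a minimal sketch of the same compatibility pattern. In the wheel, `TORCH_1_11` is imported from `ultralytics.utils.torch_utils`; the local derivation here is an assumption made only to keep the sketch self-contained:

```python
import torch

# Assumption: TORCH_1_11 mirrors the ultralytics flag, i.e. True on torch>=1.11.
TORCH_1_11 = tuple(int(v) for v in torch.__version__.split(".")[:2]) >= (1, 11)

grid_w = torch.arange(4, dtype=torch.float32)
grid_h = torch.arange(3, dtype=torch.float32)

# torch>=1.11 warns unless indexing= is passed explicitly, while older releases
# do not accept the keyword at all; the guard keeps "ij" behavior on both sides.
if TORCH_1_11:
    gw, gh = torch.meshgrid(grid_w, grid_h, indexing="ij")
else:
    gw, gh = torch.meshgrid(grid_w, grid_h)  # legacy call already used "ij" ordering

assert gw.shape == (4, 3)  # "ij" indexing: the first argument indexes dim 0
```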
```diff
@@ -251,8 +244,7 @@ class TransformerLayer(nn.Module):
     """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
 
     def __init__(self, c: int, num_heads: int):
-        """
-        Initialize a self-attention mechanism using linear transformations and multi-head attention.
+        """Initialize a self-attention mechanism using linear transformations and multi-head attention.
 
         Args:
             c (int): Input and output channel dimension.
@@ -267,8 +259,7 @@ class TransformerLayer(nn.Module):
         self.fc2 = nn.Linear(c, c, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Apply a transformer block to the input x and return the output.
+        """Apply a transformer block to the input x and return the output.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -281,11 +272,10 @@ class TransformerLayer(nn.Module):
 
 
 class TransformerBlock(nn.Module):
-    """
-    Vision Transformer block based on https://arxiv.org/abs/2010.11929.
+    """Vision Transformer block based on https://arxiv.org/abs/2010.11929.
 
-    This class implements a complete transformer block with optional convolution layer for channel adjustment,
-    learnable position embedding, and multiple transformer layers.
+    This class implements a complete transformer block with optional convolution layer for channel adjustment, learnable
+    position embedding, and multiple transformer layers.
 
     Attributes:
         conv (Conv, optional): Convolution layer if input and output channels differ.
@@ -295,8 +285,7 @@ class TransformerBlock(nn.Module):
     """
 
     def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
-        """
-        Initialize a Transformer module with position embedding and specified number of heads and layers.
+        """Initialize a Transformer module with position embedding and specified number of heads and layers.
 
         Args:
             c1 (int): Input channel dimension.
@@ -313,28 +302,26 @@ class TransformerBlock(nn.Module):
         self.c2 = c2
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward propagate the input through the transformer block.
+        """Forward propagate the input through the transformer block.
 
         Args:
-            x (torch.Tensor): Input tensor with shape [b, c1, w, h].
+            x (torch.Tensor): Input tensor with shape [b, c1, h, w].
 
         Returns:
-            (torch.Tensor): Output tensor with shape [b, c2, w, h].
+            (torch.Tensor): Output tensor with shape [b, c2, h, w].
         """
         if self.conv is not None:
             x = self.conv(x)
-        b, _, w, h = x.shape
+        b, _, h, w = x.shape
         p = x.flatten(2).permute(2, 0, 1)
-        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
+        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, h, w)
 
 
 class MLPBlock(nn.Module):
     """A single block of a multi-layer perceptron."""
 
     def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
-        """
-        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
+        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
 
         Args:
             embedding_dim (int): Input and output dimension.
```
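The `TransformerBlock.forward` hunk above swaps the unpacked spatial dimensions from `(w, h)` to `(h, w)`, matching PyTorch's `(b, c, h, w)` feature-map layout. A small self-contained sketch of the flatten/permute round-trip that code performs, showing why the dimension order matters for non-square inputs:

```python
import torch

b, c, h, w = 2, 8, 5, 7
x = torch.randn(b, c, h, w)

# (b, c, h, w) -> (b, c, h*w) -> (h*w, b, c): one token per spatial position
p = x.flatten(2).permute(2, 0, 1)
assert p.shape == (h * w, b, c)

# Inverse mapping used after the transformer layers; reshape must receive the
# dims in (h, w) order or non-square maps come back spatially scrambled.
y = p.permute(1, 2, 0).reshape(b, c, h, w)
assert torch.equal(x, y)
```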
```diff
@@ -347,8 +334,7 @@ class MLPBlock(nn.Module):
         self.act = act()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the MLPBlock.
+        """Forward pass for the MLPBlock.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -360,11 +346,10 @@ class MLPBlock(nn.Module):
 
 
 class MLP(nn.Module):
-    """
-    A simple multi-layer perceptron (also called FFN).
+    """A simple multi-layer perceptron (also called FFN).
 
-    This class implements a configurable MLP with multiple linear layers, activation functions, and optional
-    sigmoid output activation.
+    This class implements a configurable MLP with multiple linear layers, activation functions, and optional sigmoid
+    output activation.
 
     Attributes:
         num_layers (int): Number of layers in the MLP.
@@ -374,10 +359,17 @@ class MLP(nn.Module):
     """
 
     def __init__(
-        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act=nn.ReLU,
+        sigmoid: bool = False,
+        residual: bool = False,
+        out_norm: nn.Module = None,
     ):
-        """
-        Initialize the MLP with specified input, hidden, output dimensions and number of layers.
+        """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
         Args:
             input_dim (int): Input dimension.
@@ -386,17 +378,24 @@ class MLP(nn.Module):
             num_layers (int): Number of layers.
             act (nn.Module): Activation function.
             sigmoid (bool): Whether to apply sigmoid to the output.
+            residual (bool): Whether to use residual connections.
+            out_norm (nn.Module, optional): Normalization layer for the output.
         """
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
         self.sigmoid = sigmoid
         self.act = act()
+        if residual and input_dim != output_dim:
+            raise ValueError("residual is only supported if input_dim == output_dim")
+        self.residual = residual
+        # whether to apply a normalization layer to the output
+        assert isinstance(out_norm, nn.Module) or out_norm is None
+        self.out_norm = out_norm or nn.Identity()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass for the entire MLP.
+        """Forward pass for the entire MLP.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -404,17 +403,20 @@ class MLP(nn.Module):
         Returns:
             (torch.Tensor): Output tensor after MLP.
         """
+        orig_x = x
         for i, layer in enumerate(self.layers):
             x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+        if getattr(self, "residual", False):
+            x = x + orig_x
+        x = getattr(self, "out_norm", nn.Identity())(x)
         return x.sigmoid() if getattr(self, "sigmoid", False) else x
 
 
 class LayerNorm2d(nn.Module):
-    """
-    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
+    """2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
 
-    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
-    while preserving spatial dimensions.
+    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension while
+    preserving spatial dimensions.
 
     Attributes:
         weight (nn.Parameter): Learnable scale parameter.
```
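The two `MLP` hunks above add `residual` and `out_norm` options. Below is a self-contained sketch that mirrors the changed lines; it is an illustration of the diff, not the packaged class (the `getattr(self, ...)` guards in the real `forward` additionally keep checkpoints pickled before these attributes existed loadable):

```python
import torch
import torch.nn as nn


class TinyMLP(nn.Module):
    """Minimal stand-in reproducing the residual/out_norm logic added in this diff."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 act=nn.ReLU, sigmoid=False, residual=False, out_norm=None):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
        self.act = act()
        self.sigmoid = sigmoid
        if residual and input_dim != output_dim:
            raise ValueError("residual is only supported if input_dim == output_dim")
        self.residual = residual
        self.out_norm = out_norm or nn.Identity()

    def forward(self, x):
        orig_x = x
        for i, layer in enumerate(self.layers):
            x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x)
        if self.residual:
            x = x + orig_x  # valid because input_dim == output_dim was enforced
        x = self.out_norm(x)
        return x.sigmoid() if self.sigmoid else x


mlp = TinyMLP(64, 128, 64, num_layers=3, residual=True, out_norm=nn.LayerNorm(64))
assert mlp(torch.randn(2, 10, 64)).shape == (2, 10, 64)
```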
```diff
@@ -427,8 +429,7 @@ class LayerNorm2d(nn.Module):
     """
 
     def __init__(self, num_channels: int, eps: float = 1e-6):
-        """
-        Initialize LayerNorm2d with the given parameters.
+        """Initialize LayerNorm2d with the given parameters.
 
         Args:
             num_channels (int): Number of channels in the input.
@@ -440,8 +441,7 @@ class LayerNorm2d(nn.Module):
         self.eps = eps
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Perform forward pass for 2D layer normalization.
+        """Perform forward pass for 2D layer normalization.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -456,11 +456,10 @@ class LayerNorm2d(nn.Module):
 
 
 class MSDeformAttn(nn.Module):
-    """
-    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
+    """Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
 
-    This module implements multiscale deformable attention that can attend to features at multiple scales
-    with learnable sampling locations and attention weights.
+    This module implements multiscale deformable attention that can attend to features at multiple scales with learnable
+    sampling locations and attention weights.
 
     Attributes:
         im2col_step (int): Step size for im2col operations.
@@ -478,8 +477,7 @@ class MSDeformAttn(nn.Module):
     """
 
     def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
-        """
-        Initialize MSDeformAttn with the given parameters.
+        """Initialize MSDeformAttn with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -537,13 +535,12 @@ class MSDeformAttn(nn.Module):
         value_shapes: list,
         value_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform forward pass for multiscale deformable attention.
+        """Perform forward pass for multiscale deformable attention.
 
         Args:
             query (torch.Tensor): Query tensor with shape [bs, query_length, C].
-            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
-                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
+            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2], range in [0,
+                1], top-left (0,0), bottom-right (1, 1), including padding area.
             value (torch.Tensor): Value tensor with shape [bs, value_length, C].
             value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
             value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
@@ -582,8 +579,7 @@ class MSDeformAttn(nn.Module):
 
 
 class DeformableTransformerDecoderLayer(nn.Module):
-    """
-    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
+    """Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
 
     This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
     attention, and a feedforward network.
@@ -617,8 +613,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
         n_levels: int = 4,
         n_points: int = 4,
     ):
-        """
-        Initialize the DeformableTransformerDecoderLayer with the given parameters.
+        """Initialize the DeformableTransformerDecoderLayer with the given parameters.
 
         Args:
             d_model (int): Model dimension.
@@ -655,8 +650,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
         return tensor if pos is None else tensor + pos
 
     def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
-        """
-        Perform forward pass through the Feed-Forward Network part of the layer.
+        """Perform forward pass through the Feed-Forward Network part of the layer.
 
         Args:
             tgt (torch.Tensor): Input tensor.
@@ -678,8 +672,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
         attn_mask: torch.Tensor | None = None,
         query_pos: torch.Tensor | None = None,
     ) -> torch.Tensor:
-        """
-        Perform the forward pass through the entire decoder layer.
+        """Perform the forward pass through the entire decoder layer.
 
         Args:
             embed (torch.Tensor): Input embeddings.
@@ -713,11 +706,10 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
 
 class DeformableTransformerDecoder(nn.Module):
-    """
-    Deformable Transformer Decoder based on PaddleDetection implementation.
+    """Deformable Transformer Decoder based on PaddleDetection implementation.
 
-    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
-    heads for bounding box regression and classification.
+    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction heads
+    for bounding box regression and classification.
 
     Attributes:
         layers (nn.ModuleList): List of decoder layers.
@@ -730,8 +722,7 @@ class DeformableTransformerDecoder(nn.Module):
     """
 
     def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
-        """
-        Initialize the DeformableTransformerDecoder with the given parameters.
+        """Initialize the DeformableTransformerDecoder with the given parameters.
 
         Args:
             hidden_dim (int): Hidden dimension.
@@ -757,8 +748,7 @@ class DeformableTransformerDecoder(nn.Module):
         attn_mask: torch.Tensor | None = None,
         padding_mask: torch.Tensor | None = None,
     ):
-        """
-        Perform the forward pass through the entire decoder.
+        """Perform the forward pass through the entire decoder.
 
         Args:
             embed (torch.Tensor): Decoder embeddings.
```
ultralytics/nn/modules/utils.py
CHANGED

```diff
@@ -9,12 +9,11 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import uniform_
 
-__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
+__all__ = "inverse_sigmoid", "multi_scale_deformable_attn_pytorch"
 
 
 def _get_clones(module, n):
-    """
-    Create a list of cloned modules from the given module.
+    """Create a list of cloned modules from the given module.
 
     Args:
         module (nn.Module): The module to be cloned.
@@ -34,12 +33,11 @@ def _get_clones(module, n):
 
 
 def bias_init_with_prob(prior_prob=0.01):
-    """
-    Initialize conv/fc bias value according to a given probability value.
+    """Initialize conv/fc bias value according to a given probability value.
 
-    This function calculates the bias initialization value based on a prior probability using the inverse error
-    It's commonly used in object detection models to initialize classification layers with a specific positive
-    probability.
+    This function calculates the bias initialization value based on a prior probability using the inverse error
+    function. It's commonly used in object detection models to initialize classification layers with a specific positive
+    prediction probability.
 
     Args:
         prior_prob (float, optional): Prior probability for bias initialization.
@@ -56,11 +54,10 @@ def bias_init_with_prob(prior_prob=0.01):
 
 
 def linear_init(module):
-    """
-    Initialize the weights and biases of a linear module.
+    """Initialize the weights and biases of a linear module.
 
-    This function initializes the weights of a linear module using a uniform distribution within bounds calculated
-    from the input dimension. If the module has a bias, it is also initialized.
+    This function initializes the weights of a linear module using a uniform distribution within bounds calculated from
+    the input dimension. If the module has a bias, it is also initialized.
 
     Args:
         module (nn.Module): Linear module to initialize.
@@ -80,8 +77,7 @@ def linear_init(module):
 
 
 def inverse_sigmoid(x, eps=1e-5):
-    """
-    Calculate the inverse sigmoid function for a tensor.
+    """Calculate the inverse sigmoid function for a tensor.
 
     This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
     operations, particularly in attention mechanisms and coordinate transformations.
```
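This hunk only touches the `inverse_sigmoid` docstring. For reference, the function computes the eps-clamped logit used by Deformable-DETR-style code; here is a sketch of that standard formulation (the body is not shown in this diff, so treat this as the conventional implementation rather than a verbatim copy of the packaged one):

```python
import torch


def inverse_sigmoid(x, eps=1e-5):
    """Clamped logit: maps probabilities in [0, 1] back to unbounded logits."""
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)          # avoid log(0) at x == 0
    x2 = (1 - x).clamp(min=eps)    # avoid division by zero at x == 1
    return torch.log(x1 / x2)


p = torch.tensor([0.0, 0.25, 0.5, 1.0])
logits = inverse_sigmoid(p)
# Round-trips away from the clamped endpoints: sigmoid(inverse_sigmoid(p)) ~ p.
assert torch.allclose(torch.sigmoid(logits[1:3]), p[1:3], atol=1e-4)
```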
```diff
@@ -110,8 +106,7 @@ def multi_scale_deformable_attn_pytorch(
     sampling_locations: torch.Tensor,
     attention_weights: torch.Tensor,
 ) -> torch.Tensor:
-    """
-    Implement multi-scale deformable attention in PyTorch.
+    """Implement multi-scale deformable attention in PyTorch.
 
     This function performs deformable attention across multiple feature map scales, allowing the model to attend to
     different spatial locations with learned offsets.
@@ -119,10 +114,10 @@ def multi_scale_deformable_attn_pytorch(
     Args:
         value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
        value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
-        sampling_locations (torch.Tensor): The sampling locations with shape
-            (bs, num_queries, num_heads, num_levels, num_points, 2).
-        attention_weights (torch.Tensor): The attention weights with shape
-            (bs, num_queries, num_heads, num_levels, num_points).
+        sampling_locations (torch.Tensor): The sampling locations with shape (bs, num_queries, num_heads, num_levels,
+            num_points, 2).
+        attention_weights (torch.Tensor): The attention weights with shape (bs, num_queries, num_heads, num_levels,
+            num_points).
 
     Returns:
         (torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
@@ -152,7 +147,7 @@ def multi_scale_deformable_attn_pytorch(
         sampling_value_list.append(sampling_value_l_)
     # (bs, num_queries, num_heads, num_levels, num_points) ->
     # (bs, num_heads, num_queries, num_levels, num_points) ->
-    # (bs * num_heads, 1, num_queries, num_levels * num_points)
+    # (bs*num_heads, 1, num_queries, num_levels*num_points)
     attention_weights = attention_weights.transpose(1, 2).reshape(
         bs * num_heads, 1, num_queries, num_levels * num_points
     )
```
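The corrected docstring shapes above can be sanity-checked against the packaged function with random tensors. A sketch follows; note that `value`'s `embed_dims` is the per-head width, so the returned tensor's channel dimension is `num_heads * embed_dims`:

```python
import torch

from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch

bs, num_heads, embed_dims, num_queries, num_points = 2, 4, 8, 6, 3
shapes = torch.tensor([[8, 8], [4, 4]])               # (num_levels, 2): per-level (H, W)
num_levels = shapes.shape[0]
num_keys = int((shapes[:, 0] * shapes[:, 1]).sum())   # 64 + 16 flattened positions

value = torch.randn(bs, num_keys, num_heads, embed_dims)
sampling_locations = torch.rand(bs, num_queries, num_heads, num_levels, num_points, 2)
attention_weights = torch.rand(bs, num_queries, num_heads, num_levels, num_points)
attention_weights /= attention_weights.sum((-2, -1), keepdim=True)  # normalize over levels*points

out = multi_scale_deformable_attn_pytorch(value, shapes, sampling_locations, attention_weights)
assert out.shape == (bs, num_queries, num_heads * embed_dims)
```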