ultralytics 8.0.238__py3-none-any.whl → 8.0.239__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultralytics might be problematic.
- ultralytics/__init__.py +2 -2
- ultralytics/cfg/__init__.py +241 -138
- ultralytics/data/__init__.py +9 -2
- ultralytics/data/annotator.py +4 -4
- ultralytics/data/augment.py +186 -169
- ultralytics/data/base.py +54 -48
- ultralytics/data/build.py +34 -23
- ultralytics/data/converter.py +242 -70
- ultralytics/data/dataset.py +117 -95
- ultralytics/data/explorer/__init__.py +3 -1
- ultralytics/data/explorer/explorer.py +120 -100
- ultralytics/data/explorer/gui/__init__.py +1 -0
- ultralytics/data/explorer/gui/dash.py +123 -89
- ultralytics/data/explorer/utils.py +37 -39
- ultralytics/data/loaders.py +75 -62
- ultralytics/data/split_dota.py +44 -36
- ultralytics/data/utils.py +160 -142
- ultralytics/engine/exporter.py +348 -292
- ultralytics/engine/model.py +102 -66
- ultralytics/engine/predictor.py +74 -55
- ultralytics/engine/results.py +61 -41
- ultralytics/engine/trainer.py +192 -144
- ultralytics/engine/tuner.py +66 -59
- ultralytics/engine/validator.py +31 -26
- ultralytics/hub/__init__.py +54 -31
- ultralytics/hub/auth.py +28 -25
- ultralytics/hub/session.py +282 -133
- ultralytics/hub/utils.py +64 -42
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +6 -6
- ultralytics/models/fastsam/predict.py +3 -2
- ultralytics/models/fastsam/prompt.py +55 -48
- ultralytics/models/fastsam/val.py +1 -1
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +9 -8
- ultralytics/models/nas/predict.py +8 -6
- ultralytics/models/nas/val.py +11 -9
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +11 -9
- ultralytics/models/rtdetr/train.py +18 -16
- ultralytics/models/rtdetr/val.py +25 -19
- ultralytics/models/sam/__init__.py +1 -1
- ultralytics/models/sam/amg.py +13 -14
- ultralytics/models/sam/build.py +44 -42
- ultralytics/models/sam/model.py +6 -6
- ultralytics/models/sam/modules/decoders.py +6 -4
- ultralytics/models/sam/modules/encoders.py +37 -35
- ultralytics/models/sam/modules/sam.py +5 -4
- ultralytics/models/sam/modules/tiny_encoder.py +95 -73
- ultralytics/models/sam/modules/transformer.py +3 -2
- ultralytics/models/sam/predict.py +39 -27
- ultralytics/models/utils/loss.py +99 -95
- ultralytics/models/utils/ops.py +34 -31
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +8 -6
- ultralytics/models/yolo/classify/train.py +37 -31
- ultralytics/models/yolo/classify/val.py +26 -24
- ultralytics/models/yolo/detect/__init__.py +1 -1
- ultralytics/models/yolo/detect/predict.py +8 -6
- ultralytics/models/yolo/detect/train.py +47 -37
- ultralytics/models/yolo/detect/val.py +100 -82
- ultralytics/models/yolo/model.py +31 -25
- ultralytics/models/yolo/obb/__init__.py +1 -1
- ultralytics/models/yolo/obb/predict.py +13 -11
- ultralytics/models/yolo/obb/train.py +3 -3
- ultralytics/models/yolo/obb/val.py +70 -59
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +17 -12
- ultralytics/models/yolo/pose/train.py +28 -25
- ultralytics/models/yolo/pose/val.py +91 -64
- ultralytics/models/yolo/segment/__init__.py +1 -1
- ultralytics/models/yolo/segment/predict.py +10 -8
- ultralytics/models/yolo/segment/train.py +16 -15
- ultralytics/models/yolo/segment/val.py +90 -68
- ultralytics/nn/__init__.py +26 -6
- ultralytics/nn/autobackend.py +144 -112
- ultralytics/nn/modules/__init__.py +96 -13
- ultralytics/nn/modules/block.py +28 -7
- ultralytics/nn/modules/conv.py +41 -23
- ultralytics/nn/modules/head.py +60 -52
- ultralytics/nn/modules/transformer.py +49 -32
- ultralytics/nn/modules/utils.py +20 -15
- ultralytics/nn/tasks.py +215 -141
- ultralytics/solutions/ai_gym.py +59 -47
- ultralytics/solutions/distance_calculation.py +17 -14
- ultralytics/solutions/heatmap.py +57 -55
- ultralytics/solutions/object_counter.py +46 -39
- ultralytics/solutions/speed_estimation.py +13 -16
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +1 -0
- ultralytics/trackers/bot_sort.py +2 -1
- ultralytics/trackers/byte_tracker.py +10 -7
- ultralytics/trackers/track.py +7 -7
- ultralytics/trackers/utils/gmc.py +25 -25
- ultralytics/trackers/utils/kalman_filter.py +85 -42
- ultralytics/trackers/utils/matching.py +8 -7
- ultralytics/utils/__init__.py +173 -152
- ultralytics/utils/autobatch.py +10 -10
- ultralytics/utils/benchmarks.py +76 -86
- ultralytics/utils/callbacks/__init__.py +1 -1
- ultralytics/utils/callbacks/base.py +29 -29
- ultralytics/utils/callbacks/clearml.py +51 -43
- ultralytics/utils/callbacks/comet.py +81 -66
- ultralytics/utils/callbacks/dvc.py +33 -26
- ultralytics/utils/callbacks/hub.py +44 -26
- ultralytics/utils/callbacks/mlflow.py +31 -24
- ultralytics/utils/callbacks/neptune.py +35 -25
- ultralytics/utils/callbacks/raytune.py +9 -4
- ultralytics/utils/callbacks/tensorboard.py +16 -11
- ultralytics/utils/callbacks/wb.py +39 -33
- ultralytics/utils/checks.py +189 -141
- ultralytics/utils/dist.py +15 -12
- ultralytics/utils/downloads.py +112 -96
- ultralytics/utils/errors.py +1 -1
- ultralytics/utils/files.py +11 -11
- ultralytics/utils/instance.py +22 -22
- ultralytics/utils/loss.py +117 -67
- ultralytics/utils/metrics.py +224 -158
- ultralytics/utils/ops.py +38 -28
- ultralytics/utils/patches.py +3 -3
- ultralytics/utils/plotting.py +217 -120
- ultralytics/utils/tal.py +19 -13
- ultralytics/utils/torch_utils.py +138 -109
- ultralytics/utils/triton.py +12 -10
- ultralytics/utils/tuner.py +49 -47
- {ultralytics-8.0.238.dist-info → ultralytics-8.0.239.dist-info}/METADATA +2 -1
- ultralytics-8.0.239.dist-info/RECORD +188 -0
- ultralytics-8.0.238.dist-info/RECORD +0 -188
- {ultralytics-8.0.238.dist-info → ultralytics-8.0.239.dist-info}/LICENSE +0 -0
- {ultralytics-8.0.238.dist-info → ultralytics-8.0.239.dist-info}/WHEEL +0 -0
- {ultralytics-8.0.238.dist-info → ultralytics-8.0.239.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.0.238.dist-info → ultralytics-8.0.239.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/block.py
CHANGED
@@ -8,8 +8,26 @@ import torch.nn.functional as F
 from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
 from .transformer import TransformerBlock
 
-__all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
-           'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3', 'ResNetLayer')
+__all__ = (
+    "DFL",
+    "HGBlock",
+    "HGStem",
+    "SPP",
+    "SPPF",
+    "C1",
+    "C2",
+    "C3",
+    "C2f",
+    "C3x",
+    "C3TR",
+    "C3Ghost",
+    "GhostBottleneck",
+    "Bottleneck",
+    "BottleneckCSP",
+    "Proto",
+    "RepC3",
+    "ResNetLayer",
+)
 
 
 class DFL(nn.Module):
@@ -284,9 +302,11 @@ class GhostBottleneck(nn.Module):
         self.conv = nn.Sequential(
             GhostConv(c1, c_, 1, 1),  # pw
             DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(),  # dw
-            GhostConv(c_, c2, 1, 1, act=False))  # pw-linear
-        self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
-                                                                            act=False)) if s == 2 else nn.Identity()
+            GhostConv(c_, c2, 1, 1, act=False),  # pw-linear
+        )
+        self.shortcut = (
+            nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
+        )
 
     def forward(self, x):
         """Applies skip connection and concatenation to input tensor."""
@@ -359,8 +379,9 @@ class ResNetLayer(nn.Module):
         self.is_first = is_first
 
         if self.is_first:
-            self.layer = nn.Sequential(Conv(c1, c2, k=7, s=2, p=3, act=True),
-                                       nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
+            self.layer = nn.Sequential(
+                Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+            )
         else:
             blocks = [ResNetBlock(c1, c2, s, e=e)]
             blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
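The block.py edits above are Black-style reformatting (double quotes, one name per line in __all__, explicit closing parentheses); module behaviour should be unchanged. A minimal sanity-check sketch, assuming the ultralytics wheel is installed and GhostBottleneck keeps its (c1, c2, k, s) signature:

import torch

from ultralytics.nn.modules.block import GhostBottleneck

# pw -> dw -> pw-linear branch plus a shortcut branch; the output shape should be
# identical across 8.0.238 and 8.0.239 since only formatting changed here.
m = GhostBottleneck(c1=64, c2=128, k=3, s=2).eval()
x = torch.randn(1, 64, 32, 32)
with torch.no_grad():
    y = m(x)
print(y.shape)  # expected: torch.Size([1, 128, 16, 16])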
ultralytics/nn/modules/conv.py
CHANGED
@@ -7,8 +7,21 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-__all__ = ('Conv', 'Conv2', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
-           'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
+__all__ = (
+    "Conv",
+    "Conv2",
+    "LightConv",
+    "DWConv",
+    "DWConvTranspose2d",
+    "ConvTranspose",
+    "Focus",
+    "GhostConv",
+    "ChannelAttention",
+    "SpatialAttention",
+    "CBAM",
+    "Concat",
+    "RepConv",
+)
 
 
 def autopad(k, p=None, d=1):  # kernel, padding, dilation
@@ -22,6 +35,7 @@ def autopad(k, p=None, d=1):  # kernel, padding, dilation
 
 class Conv(nn.Module):
     """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
+
     default_act = nn.SiLU()  # default activation
 
     def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
@@ -60,9 +74,9 @@ class Conv2(Conv):
         """Fuse parallel convolutions."""
         w = torch.zeros_like(self.conv.weight.data)
         i = [x // 2 for x in w.shape[2:]]
-        w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
+        w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
         self.conv.weight.data += w
-        self.__delattr__('cv2')
+        self.__delattr__("cv2")
         self.forward = self.forward_fuse
 
 
@@ -102,6 +116,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):
 
 class ConvTranspose(nn.Module):
     """Convolution transpose 2d layer."""
+
     default_act = nn.SiLU()  # default activation
 
     def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
@@ -164,6 +179,7 @@ class RepConv(nn.Module):
     This module is used in RT-DETR.
     Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
     """
+
     default_act = nn.SiLU()  # default activation
 
     def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
@@ -214,7 +230,7 @@ class RepConv(nn.Module):
             beta = branch.bn.bias
             eps = branch.bn.eps
         elif isinstance(branch, nn.BatchNorm2d):
-            if not hasattr(self, 'id_tensor'):
+            if not hasattr(self, "id_tensor"):
                 input_dim = self.c1 // self.g
                 kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
                 for i in range(self.c1):
@@ -232,29 +248,31 @@ class RepConv(nn.Module):
 
     def fuse_convs(self):
         """Combines two convolution layers into a single layer and removes unused attributes from the class."""
-        if hasattr(self, 'conv'):
+        if hasattr(self, "conv"):
             return
         kernel, bias = self.get_equivalent_kernel_bias()
-        self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
-                              out_channels=self.conv1.conv.out_channels,
-                              kernel_size=self.conv1.conv.kernel_size,
-                              stride=self.conv1.conv.stride,
-                              padding=self.conv1.conv.padding,
-                              dilation=self.conv1.conv.dilation,
-                              groups=self.conv1.conv.groups,
-                              bias=True).requires_grad_(False)
+        self.conv = nn.Conv2d(
+            in_channels=self.conv1.conv.in_channels,
+            out_channels=self.conv1.conv.out_channels,
+            kernel_size=self.conv1.conv.kernel_size,
+            stride=self.conv1.conv.stride,
+            padding=self.conv1.conv.padding,
+            dilation=self.conv1.conv.dilation,
+            groups=self.conv1.conv.groups,
+            bias=True,
+        ).requires_grad_(False)
         self.conv.weight.data = kernel
         self.conv.bias.data = bias
         for para in self.parameters():
             para.detach_()
-        self.__delattr__('conv1')
-        self.__delattr__('conv2')
-        if hasattr(self, 'nm'):
-            self.__delattr__('nm')
-        if hasattr(self, 'bn'):
-            self.__delattr__('bn')
-        if hasattr(self, 'id_tensor'):
-            self.__delattr__('id_tensor')
+        self.__delattr__("conv1")
+        self.__delattr__("conv2")
+        if hasattr(self, "nm"):
+            self.__delattr__("nm")
+        if hasattr(self, "bn"):
+            self.__delattr__("bn")
+        if hasattr(self, "id_tensor"):
+            self.__delattr__("id_tensor")
 
 
 class ChannelAttention(nn.Module):
@@ -278,7 +296,7 @@ class SpatialAttention(nn.Module):
     def __init__(self, kernel_size=7):
         """Initialize Spatial-attention module with kernel size argument."""
         super().__init__()
-        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
+        assert kernel_size in (3, 7), "kernel size must be 3 or 7"
         padding = 3 if kernel_size == 7 else 1
         self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
         self.act = nn.Sigmoid()
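The only functionally interesting line in the Conv2 hunk, w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone(), is also just reformatted (spaces around the slice colons): it places the 1x1 branch weights at the spatial centre of the kxk kernel so the two parallel convolutions can be fused into one. A standalone sketch of that idea in plain PyTorch (simplified: no bias, stride 1, no groups; not the library's exact code):

import torch
import torch.nn.functional as F

# Fuse a parallel 3x3 + 1x1 convolution pair into a single 3x3 kernel by adding
# the 1x1 weights at the spatial centre (index k // 2), the trick Conv2.fuse_convs() uses.
w3 = torch.randn(8, 4, 3, 3)  # weights of the 3x3 branch
w1 = torch.randn(8, 4, 1, 1)  # weights of the 1x1 branch
x = torch.randn(2, 4, 16, 16)

fused = w3.clone()
fused[:, :, 1:2, 1:2] += w1  # centre slice, i = [s // 2 for s in (3, 3)]

y_parallel = F.conv2d(x, w3, padding=1) + F.conv2d(x, w1)
y_fused = F.conv2d(x, fused, padding=1)
print(torch.allclose(y_parallel, y_fused, atol=1e-5))  # True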
ultralytics/nn/modules/head.py
CHANGED
@@ -14,11 +14,12 @@ from .conv import Conv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init_
 
-__all__ = 'Detect', 'Segment', 'Pose', 'Classify', 'OBB', 'RTDETRDecoder'
+__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"
 
 
 class Detect(nn.Module):
     """YOLOv8 Detect head for detection models."""
+
     dynamic = False  # force grid reconstruction
     export = False  # export mode
     shape = None
@@ -35,7 +36,8 @@ class Detect(nn.Module):
         self.stride = torch.zeros(self.nl)  # strides computed during build
         c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
         self.cv2 = nn.ModuleList(
-            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
+            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
+        )
         self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
         self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
 
@@ -53,14 +55,14 @@ class Detect(nn.Module):
             self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
             self.shape = shape
 
-        if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'):  # avoid TF FlexSplitV ops
-            box = x_cat[:, :self.reg_max * 4]
-            cls = x_cat[:, self.reg_max * 4:]
+        if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"):  # avoid TF FlexSplitV ops
+            box = x_cat[:, : self.reg_max * 4]
+            cls = x_cat[:, self.reg_max * 4 :]
         else:
             box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
         dbox = self.decode_bboxes(box)
 
-        if self.export and self.format in ('tflite', 'edgetpu'):
+        if self.export and self.format in ("tflite", "edgetpu"):
             # Precompute normalization factor to increase numerical stability
             # See https://github.com/ultralytics/ultralytics/issues/7371
             img_h = shape[2]
@@ -79,7 +81,7 @@ class Detect(nn.Module):
         # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
         for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
             a[-1].bias.data[:] = 1.0  # box
-            b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
+            b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
 
     def decode_bboxes(self, bboxes):
         """Decode bounding boxes."""
@@ -116,6 +118,7 @@ class OBB(Detect):
     """YOLOv8 OBB detection head for detection with rotation models."""
 
     def __init__(self, nc=80, ne=1, ch=()):
+        """Initialize OBB with number of classes `nc` and layer channels `ch`."""
         super().__init__(nc, ch)
         self.ne = ne  # number of extra parameters
         self.detect = Detect.forward
@@ -124,6 +127,7 @@ class OBB(Detect):
         self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.ne, 1)) for x in ch)
 
     def forward(self, x):
+        """Concatenates and returns predicted bounding boxes and class probabilities."""
         bs = x[0].shape[0]  # batch size
         angle = torch.cat([self.cv4[i](x[i]).view(bs, self.ne, -1) for i in range(self.nl)], 2)  # OBB theta logits
         # NOTE: set `angle` as an attribute so that `decode_bboxes` could use it.
@@ -212,26 +216,28 @@ class RTDETRDecoder(nn.Module):
     and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
     Transformer decoder layers to output the final predictions.
     """
+
     export = False  # export mode
 
     def __init__(
-            self,
-            nc=80,
-            ch=(512, 1024, 2048),
-            hd=256,  # hidden dim
-            nq=300,  # num queries
-            ndp=4,  # num decoder points
-            nh=8,  # num head
-            ndl=6,  # num decoder layers
-            d_ffn=1024,  # dim of feedforward
-            dropout=0.,
-            act=nn.ReLU(),
-            eval_idx=-1,
-            # Training args
-            nd=100,  # num denoising
-            label_noise_ratio=0.5,
-            box_noise_scale=1.0,
-            learnt_init_query=False):
+        self,
+        nc=80,
+        ch=(512, 1024, 2048),
+        hd=256,  # hidden dim
+        nq=300,  # num queries
+        ndp=4,  # num decoder points
+        nh=8,  # num head
+        ndl=6,  # num decoder layers
+        d_ffn=1024,  # dim of feedforward
+        dropout=0.0,
+        act=nn.ReLU(),
+        eval_idx=-1,
+        # Training args
+        nd=100,  # num denoising
+        label_noise_ratio=0.5,
+        box_noise_scale=1.0,
+        learnt_init_query=False,
+    ):
         """
         Initializes the RTDETRDecoder module with the given parameters.
 
@@ -300,28 +306,30 @@ class RTDETRDecoder(nn.Module):
         feats, shapes = self._get_encoder_input(x)
 
         # Prepare denoising training
-        dn_embed, dn_bbox, attn_mask, dn_meta = \
-            get_cdn_group(batch,
-                          self.nc,
-                          self.num_queries,
-                          self.denoising_class_embed.weight,
-                          self.num_denoising,
-                          self.label_noise_ratio,
-                          self.box_noise_scale,
-                          self.training)
-
-        embed, refer_bbox, enc_bboxes, enc_scores = \
-            self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
+        dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
+            batch,
+            self.nc,
+            self.num_queries,
+            self.denoising_class_embed.weight,
+            self.num_denoising,
+            self.label_noise_ratio,
+            self.box_noise_scale,
+            self.training,
+        )
+
+        embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
 
         # Decoder
-        dec_bboxes, dec_scores = self.decoder(embed,
-                                              refer_bbox,
-                                              feats,
-                                              shapes,
-                                              self.dec_bbox_head,
-                                              self.dec_score_head,
-                                              self.query_pos_head,
-                                              attn_mask=attn_mask)
+        dec_bboxes, dec_scores = self.decoder(
+            embed,
+            refer_bbox,
+            feats,
+            shapes,
+            self.dec_bbox_head,
+            self.dec_score_head,
+            self.query_pos_head,
+            attn_mask=attn_mask,
+        )
         x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
         if self.training:
             return x
@@ -329,24 +337,24 @@ class RTDETRDecoder(nn.Module):
         y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
         return y if self.export else (y, x)
 
-    def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
+    def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
         """Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
         anchors = []
         for i, (h, w) in enumerate(shapes):
             sy = torch.arange(end=h, dtype=dtype, device=device)
             sx = torch.arange(end=w, dtype=dtype, device=device)
-            grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
+            grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
             grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2)
 
             valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
             grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # (1, h, w, 2)
-            wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
+            wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
             anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4))  # (1, h*w, 4)
 
         anchors = torch.cat(anchors, 1)  # (1, h*w*nl, 4)
         valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)  # 1, h*w*nl, 1
         anchors = torch.log(anchors / (1 - anchors))
-        anchors = anchors.masked_fill(~valid_mask, float('inf'))
+        anchors = anchors.masked_fill(~valid_mask, float("inf"))
         return anchors, valid_mask
 
     def _get_encoder_input(self, x):
@@ -413,13 +421,13 @@ class RTDETRDecoder(nn.Module):
         # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
         # linear_init_(self.enc_score_head)
         constant_(self.enc_score_head.bias, bias_cls)
-        constant_(self.enc_bbox_head.layers[-1].weight, 0.)
-        constant_(self.enc_bbox_head.layers[-1].bias, 0.)
+        constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
+        constant_(self.enc_bbox_head.layers[-1].bias, 0.0)
         for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
             # linear_init_(cls_)
             constant_(cls_.bias, bias_cls)
-            constant_(reg_.layers[-1].weight, 0.)
-            constant_(reg_.layers[-1].bias, 0.)
+            constant_(reg_.layers[-1].weight, 0.0)
+            constant_(reg_.layers[-1].bias, 0.0)
 
         linear_init_(self.enc_output[0])
         xavier_uniform_(self.enc_output[0].weight)
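In the Detect forward hunk above, the TF export path slices the concatenated prediction tensor instead of calling split(), with the comment "avoid TF FlexSplitV ops". A small sketch showing that the slice and the split produce identical tensors (the sizes below are illustrative, not read from the wheel):

import torch

# Slicing x_cat keeps the exported TFLite/Edge TPU graph free of FlexSplitV ops;
# both approaches yield the same box and class tensors.
reg_max, nc, anchors = 16, 80, 8400  # assumed example sizes
x_cat = torch.randn(1, 4 * reg_max + nc, anchors)

box_a, cls_a = x_cat.split((4 * reg_max, nc), 1)
box_b = x_cat[:, : 4 * reg_max]
cls_b = x_cat[:, 4 * reg_max :]
print(torch.equal(box_a, box_b), torch.equal(cls_a, cls_b))  # True True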
ultralytics/nn/modules/transformer.py
CHANGED
@@ -11,8 +11,18 @@ from torch.nn.init import constant_, xavier_uniform_
 from .conv import Conv
 from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
 
-__all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
-           'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
+__all__ = (
+    "TransformerEncoderLayer",
+    "TransformerLayer",
+    "TransformerBlock",
+    "MLPBlock",
+    "LayerNorm2d",
+    "AIFI",
+    "DeformableTransformerDecoder",
+    "DeformableTransformerDecoderLayer",
+    "MSDeformAttn",
+    "MLP",
+)
 
 
 class TransformerEncoderLayer(nn.Module):
@@ -22,9 +32,11 @@ class TransformerEncoderLayer(nn.Module):
         """Initialize the TransformerEncoderLayer with specified parameters."""
         super().__init__()
         from ...utils.torch_utils import TORCH_1_9
+
         if not TORCH_1_9:
             raise ModuleNotFoundError(
-                'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
+                "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
+            )
         self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
         # Implementation of Feedforward model
         self.fc1 = nn.Linear(c1, cm)
@@ -91,12 +103,11 @@ class AIFI(TransformerEncoderLayer):
         """Builds 2D sine-cosine position embedding."""
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
-        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
-        assert embed_dim % 4 == 0, \
-            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
         pos_dim = embed_dim // 4
         omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
-        omega = 1. / (temperature ** omega)
+        omega = 1.0 / (temperature**omega)
 
         out_w = grid_w.flatten()[..., None] @ omega[None]
         out_h = grid_h.flatten()[..., None] @ omega[None]
@@ -213,10 +224,10 @@ class MSDeformAttn(nn.Module):
         """Initialize MSDeformAttn with the given parameters."""
         super().__init__()
         if d_model % n_heads != 0:
-            raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
+            raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
         _d_per_head = d_model // n_heads
         # Better to set _d_per_head to a power of 2 which is more efficient in a CUDA implementation
-        assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'
+        assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"
 
         self.im2col_step = 64
 
@@ -234,21 +245,24 @@ class MSDeformAttn(nn.Module):
 
     def _reset_parameters(self):
         """Reset module parameters."""
-        constant_(self.sampling_offsets.weight.data, 0.)
+        constant_(self.sampling_offsets.weight.data, 0.0)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
-        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
-            1, self.n_levels, self.n_points, 1)
+        grid_init = (
+            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+            .view(self.n_heads, 1, 1, 2)
+            .repeat(1, self.n_levels, self.n_points, 1)
+        )
         for i in range(self.n_points):
             grid_init[:, :, i, :] *= i + 1
         with torch.no_grad():
             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
-        constant_(self.attention_weights.weight.data, 0.)
-        constant_(self.attention_weights.bias.data, 0.)
+        constant_(self.attention_weights.weight.data, 0.0)
+        constant_(self.attention_weights.bias.data, 0.0)
         xavier_uniform_(self.value_proj.weight.data)
-        constant_(self.value_proj.bias.data, 0.)
+        constant_(self.value_proj.bias.data, 0.0)
         xavier_uniform_(self.output_proj.weight.data)
-        constant_(self.output_proj.bias.data, 0.)
+        constant_(self.output_proj.bias.data, 0.0)
 
     def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
         """
@@ -288,7 +302,7 @@ class MSDeformAttn(nn.Module):
             add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
             sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
         else:
-            raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
+            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
         output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
         return self.output_proj(output)
 
@@ -301,7 +315,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
     https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
     """
 
-    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
+    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
         """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
         super().__init__()
 
@@ -339,14 +353,16 @@ class DeformableTransformerDecoderLayer(nn.Module):
 
         # Self attention
         q = k = self.with_pos_embed(embed, query_pos)
-        tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
-                             attn_mask=attn_mask)[0].transpose(0, 1)
+        tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
+            0
+        ].transpose(0, 1)
         embed = embed + self.dropout1(tgt)
         embed = self.norm1(embed)
 
         # Cross attention
-        tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
-                              padding_mask)
+        tgt = self.cross_attn(
+            self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
+        )
         embed = embed + self.dropout2(tgt)
         embed = self.norm2(embed)
 
@@ -370,16 +386,17 @@ class DeformableTransformerDecoder(nn.Module):
         self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
 
     def forward(
-            self,
-            embed,  # decoder embeddings
-            refer_bbox,  # anchor
-            feats,  # image features
-            shapes,  # feature shapes
-            bbox_head,
-            score_head,
-            pos_mlp,
-            attn_mask=None,
-            padding_mask=None):
+        self,
+        embed,  # decoder embeddings
+        refer_bbox,  # anchor
+        feats,  # image features
+        shapes,  # feature shapes
+        bbox_head,
+        score_head,
+        pos_mlp,
+        attn_mask=None,
+        padding_mask=None,
+    ):
         """Perform the forward pass through the entire decoder."""
         output = embed
         dec_bboxes = []
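The AIFI hunk above touches the 2D sine-cosine position embedding (the meshgrid, omega = 1.0 / (temperature**omega), and the per-axis outer products). A self-contained sketch of that construction; the final concatenation order of the sin/cos terms is an assumption based on the surrounding code:

import torch

def sincos_pos_embed_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0) -> torch.Tensor:
    # Per-axis frequency bands, then sin/cos of each axis concatenated on the channel dim.
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4"
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)
    out_w = grid_w.flatten()[..., None] @ omega[None]  # (w*h, pos_dim)
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]

print(sincos_pos_embed_2d(20, 20).shape)  # torch.Size([1, 400, 256])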
ultralytics/nn/modules/utils.py
CHANGED
@@ -10,7 +10,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import uniform_
 
-__all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
+__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
 
 
 def _get_clones(module, n):
@@ -27,7 +27,7 @@ def linear_init_(module):
     """Initialize the weights and biases of a linear module."""
     bound = 1 / math.sqrt(module.weight.shape[0])
     uniform_(module.weight, -bound, bound)
-    if hasattr(module, 'bias') and module.bias is not None:
+    if hasattr(module, "bias") and module.bias is not None:
         uniform_(module.bias, -bound, bound)
 
 
@@ -39,9 +39,12 @@ def inverse_sigmoid(x, eps=1e-5):
     return torch.log(x1 / x2)
 
 
-def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
-                                        sampling_locations: torch.Tensor,
-                                        attention_weights: torch.Tensor) -> torch.Tensor:
+def multi_scale_deformable_attn_pytorch(
+    value: torch.Tensor,
+    value_spatial_shapes: torch.Tensor,
+    sampling_locations: torch.Tensor,
+    attention_weights: torch.Tensor,
+) -> torch.Tensor:
     """
     Multi-scale deformable attention.
 
@@ -58,23 +61,25 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
         # bs, H_*W_, num_heads*embed_dims ->
         # bs, num_heads*embed_dims, H_*W_ ->
         # bs*num_heads, embed_dims, H_, W_
-        value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
+        value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
         # bs, num_queries, num_heads, num_points, 2 ->
         # bs, num_heads, num_queries, num_points, 2 ->
        
 # bs*num_heads, num_queries, num_points, 2
         sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
         # bs*num_heads, embed_dims, num_queries, num_points
-        sampling_value_l_ = F.grid_sample(value_l_,
-                                          sampling_grid_l_,
-                                          mode='bilinear',
-                                          padding_mode='zeros',
-                                          align_corners=False)
+        sampling_value_l_ = F.grid_sample(
+            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+        )
         sampling_value_list.append(sampling_value_l_)
     # (bs, num_queries, num_heads, num_levels, num_points) ->
     # (bs, num_heads, num_queries, num_levels, num_points) ->
     # (bs, num_heads, 1, num_queries, num_levels*num_points)
-    attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
-                                                                  num_levels * num_points)
-    output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) *
-               attention_weights).sum(-1).view(bs, num_heads * embed_dims, num_queries))
+    attention_weights = attention_weights.transpose(1, 2).reshape(
+        bs * num_heads, 1, num_queries, num_levels * num_points
+    )
+    output = (
+        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+        .sum(-1)
+        .view(bs, num_heads * embed_dims, num_queries)
+    )
     return output.transpose(1, 2).contiguous()
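The utils.py hunk spells out the signature of multi_scale_deformable_attn_pytorch. A usage sketch, assuming the wheel is installed; the input shapes are inferred from the tensor manipulations shown above and are an assumption, not documented API:

import torch

from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch

# Assumed shapes:
#   value              (bs, sum(H*W), n_heads, head_dim)
#   shapes             per-level (H, W) pairs
#   sampling_locations (bs, n_queries, n_heads, n_levels, n_points, 2), normalised to [0, 1]
#   attention_weights  (bs, n_queries, n_heads, n_levels, n_points)
bs, n_heads, head_dim, n_queries, n_points = 2, 8, 32, 50, 4
shapes = [[32, 32], [16, 16]]  # two feature levels
n_levels = len(shapes)
num_value = sum(h * w for h, w in shapes)

value = torch.randn(bs, num_value, n_heads, head_dim)
locs = torch.rand(bs, n_queries, n_heads, n_levels, n_points, 2)
weights = torch.rand(bs, n_queries, n_heads, n_levels, n_points).softmax(-1)

out = multi_scale_deformable_attn_pytorch(value, shapes, locs, weights)
print(out.shape)  # expected: torch.Size([2, 50, 256]) since n_heads * head_dim = 256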