ultralytics 8.0.237__py3-none-any.whl → 8.0.239__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries, and reflects the changes between those versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of ultralytics might be problematic.

Files changed (137)
  1. ultralytics/__init__.py +2 -2
  2. ultralytics/cfg/__init__.py +241 -138
  3. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  4. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  5. ultralytics/cfg/datasets/dota8.yaml +34 -0
  6. ultralytics/data/__init__.py +9 -2
  7. ultralytics/data/annotator.py +4 -4
  8. ultralytics/data/augment.py +186 -169
  9. ultralytics/data/base.py +54 -48
  10. ultralytics/data/build.py +34 -23
  11. ultralytics/data/converter.py +242 -70
  12. ultralytics/data/dataset.py +117 -95
  13. ultralytics/data/explorer/__init__.py +5 -0
  14. ultralytics/data/explorer/explorer.py +170 -97
  15. ultralytics/data/explorer/gui/__init__.py +1 -0
  16. ultralytics/data/explorer/gui/dash.py +146 -76
  17. ultralytics/data/explorer/utils.py +87 -25
  18. ultralytics/data/loaders.py +75 -62
  19. ultralytics/data/split_dota.py +44 -36
  20. ultralytics/data/utils.py +160 -142
  21. ultralytics/engine/exporter.py +348 -292
  22. ultralytics/engine/model.py +102 -66
  23. ultralytics/engine/predictor.py +74 -55
  24. ultralytics/engine/results.py +63 -40
  25. ultralytics/engine/trainer.py +192 -144
  26. ultralytics/engine/tuner.py +66 -59
  27. ultralytics/engine/validator.py +31 -26
  28. ultralytics/hub/__init__.py +54 -31
  29. ultralytics/hub/auth.py +28 -25
  30. ultralytics/hub/session.py +282 -133
  31. ultralytics/hub/utils.py +64 -42
  32. ultralytics/models/__init__.py +1 -1
  33. ultralytics/models/fastsam/__init__.py +1 -1
  34. ultralytics/models/fastsam/model.py +6 -6
  35. ultralytics/models/fastsam/predict.py +3 -2
  36. ultralytics/models/fastsam/prompt.py +55 -48
  37. ultralytics/models/fastsam/val.py +1 -1
  38. ultralytics/models/nas/__init__.py +1 -1
  39. ultralytics/models/nas/model.py +9 -8
  40. ultralytics/models/nas/predict.py +8 -6
  41. ultralytics/models/nas/val.py +11 -9
  42. ultralytics/models/rtdetr/__init__.py +1 -1
  43. ultralytics/models/rtdetr/model.py +11 -9
  44. ultralytics/models/rtdetr/train.py +18 -16
  45. ultralytics/models/rtdetr/val.py +25 -19
  46. ultralytics/models/sam/__init__.py +1 -1
  47. ultralytics/models/sam/amg.py +13 -14
  48. ultralytics/models/sam/build.py +44 -42
  49. ultralytics/models/sam/model.py +6 -6
  50. ultralytics/models/sam/modules/decoders.py +6 -4
  51. ultralytics/models/sam/modules/encoders.py +37 -35
  52. ultralytics/models/sam/modules/sam.py +5 -4
  53. ultralytics/models/sam/modules/tiny_encoder.py +95 -73
  54. ultralytics/models/sam/modules/transformer.py +3 -2
  55. ultralytics/models/sam/predict.py +39 -27
  56. ultralytics/models/utils/loss.py +99 -95
  57. ultralytics/models/utils/ops.py +34 -31
  58. ultralytics/models/yolo/__init__.py +1 -1
  59. ultralytics/models/yolo/classify/__init__.py +1 -1
  60. ultralytics/models/yolo/classify/predict.py +8 -6
  61. ultralytics/models/yolo/classify/train.py +37 -31
  62. ultralytics/models/yolo/classify/val.py +26 -24
  63. ultralytics/models/yolo/detect/__init__.py +1 -1
  64. ultralytics/models/yolo/detect/predict.py +8 -6
  65. ultralytics/models/yolo/detect/train.py +47 -37
  66. ultralytics/models/yolo/detect/val.py +100 -82
  67. ultralytics/models/yolo/model.py +31 -25
  68. ultralytics/models/yolo/obb/__init__.py +1 -1
  69. ultralytics/models/yolo/obb/predict.py +13 -12
  70. ultralytics/models/yolo/obb/train.py +3 -3
  71. ultralytics/models/yolo/obb/val.py +80 -58
  72. ultralytics/models/yolo/pose/__init__.py +1 -1
  73. ultralytics/models/yolo/pose/predict.py +17 -12
  74. ultralytics/models/yolo/pose/train.py +28 -25
  75. ultralytics/models/yolo/pose/val.py +91 -64
  76. ultralytics/models/yolo/segment/__init__.py +1 -1
  77. ultralytics/models/yolo/segment/predict.py +10 -8
  78. ultralytics/models/yolo/segment/train.py +16 -15
  79. ultralytics/models/yolo/segment/val.py +90 -68
  80. ultralytics/nn/__init__.py +26 -6
  81. ultralytics/nn/autobackend.py +144 -112
  82. ultralytics/nn/modules/__init__.py +96 -13
  83. ultralytics/nn/modules/block.py +28 -7
  84. ultralytics/nn/modules/conv.py +41 -23
  85. ultralytics/nn/modules/head.py +67 -59
  86. ultralytics/nn/modules/transformer.py +49 -32
  87. ultralytics/nn/modules/utils.py +20 -15
  88. ultralytics/nn/tasks.py +215 -141
  89. ultralytics/solutions/ai_gym.py +59 -47
  90. ultralytics/solutions/distance_calculation.py +22 -15
  91. ultralytics/solutions/heatmap.py +76 -54
  92. ultralytics/solutions/object_counter.py +46 -39
  93. ultralytics/solutions/speed_estimation.py +13 -16
  94. ultralytics/trackers/__init__.py +1 -1
  95. ultralytics/trackers/basetrack.py +1 -0
  96. ultralytics/trackers/bot_sort.py +2 -1
  97. ultralytics/trackers/byte_tracker.py +10 -7
  98. ultralytics/trackers/track.py +7 -7
  99. ultralytics/trackers/utils/gmc.py +25 -25
  100. ultralytics/trackers/utils/kalman_filter.py +85 -42
  101. ultralytics/trackers/utils/matching.py +8 -7
  102. ultralytics/utils/__init__.py +173 -151
  103. ultralytics/utils/autobatch.py +10 -10
  104. ultralytics/utils/benchmarks.py +76 -86
  105. ultralytics/utils/callbacks/__init__.py +1 -1
  106. ultralytics/utils/callbacks/base.py +29 -29
  107. ultralytics/utils/callbacks/clearml.py +51 -43
  108. ultralytics/utils/callbacks/comet.py +81 -66
  109. ultralytics/utils/callbacks/dvc.py +33 -26
  110. ultralytics/utils/callbacks/hub.py +44 -26
  111. ultralytics/utils/callbacks/mlflow.py +31 -24
  112. ultralytics/utils/callbacks/neptune.py +35 -25
  113. ultralytics/utils/callbacks/raytune.py +9 -4
  114. ultralytics/utils/callbacks/tensorboard.py +16 -11
  115. ultralytics/utils/callbacks/wb.py +39 -33
  116. ultralytics/utils/checks.py +189 -141
  117. ultralytics/utils/dist.py +15 -12
  118. ultralytics/utils/downloads.py +112 -96
  119. ultralytics/utils/errors.py +1 -1
  120. ultralytics/utils/files.py +11 -11
  121. ultralytics/utils/instance.py +22 -22
  122. ultralytics/utils/loss.py +117 -67
  123. ultralytics/utils/metrics.py +224 -158
  124. ultralytics/utils/ops.py +39 -29
  125. ultralytics/utils/patches.py +3 -3
  126. ultralytics/utils/plotting.py +217 -120
  127. ultralytics/utils/tal.py +19 -13
  128. ultralytics/utils/torch_utils.py +138 -109
  129. ultralytics/utils/triton.py +12 -10
  130. ultralytics/utils/tuner.py +49 -47
  131. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/METADATA +5 -4
  132. ultralytics-8.0.239.dist-info/RECORD +188 -0
  133. ultralytics-8.0.237.dist-info/RECORD +0 -187
  134. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/LICENSE +0 -0
  135. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/WHEEL +0 -0
  136. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/entry_points.txt +0 -0
  137. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/block.py

@@ -8,8 +8,26 @@ import torch.nn.functional as F
  from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
  from .transformer import TransformerBlock

- __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
- 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3', 'ResNetLayer')
+ __all__ = (
+ "DFL",
+ "HGBlock",
+ "HGStem",
+ "SPP",
+ "SPPF",
+ "C1",
+ "C2",
+ "C3",
+ "C2f",
+ "C3x",
+ "C3TR",
+ "C3Ghost",
+ "GhostBottleneck",
+ "Bottleneck",
+ "BottleneckCSP",
+ "Proto",
+ "RepC3",
+ "ResNetLayer",
+ )


  class DFL(nn.Module):
@@ -284,9 +302,11 @@ class GhostBottleneck(nn.Module):
  self.conv = nn.Sequential(
  GhostConv(c1, c_, 1, 1), # pw
  DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
- GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
- self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
- act=False)) if s == 2 else nn.Identity()
+ GhostConv(c_, c2, 1, 1, act=False), # pw-linear
+ )
+ self.shortcut = (
+ nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
+ )

  def forward(self, x):
  """Applies skip connection and concatenation to input tensor."""
@@ -359,8 +379,9 @@ class ResNetLayer(nn.Module):
  self.is_first = is_first

  if self.is_first:
- self.layer = nn.Sequential(Conv(c1, c2, k=7, s=2, p=3, act=True),
- nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
+ self.layer = nn.Sequential(
+ Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ )
  else:
  blocks = [ResNetBlock(c1, c2, s, e=e)]
  blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
ultralytics/nn/modules/conv.py

@@ -7,8 +7,21 @@ import numpy as np
  import torch
  import torch.nn as nn

- __all__ = ('Conv', 'Conv2', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
- 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
+ __all__ = (
+ "Conv",
+ "Conv2",
+ "LightConv",
+ "DWConv",
+ "DWConvTranspose2d",
+ "ConvTranspose",
+ "Focus",
+ "GhostConv",
+ "ChannelAttention",
+ "SpatialAttention",
+ "CBAM",
+ "Concat",
+ "RepConv",
+ )


  def autopad(k, p=None, d=1): # kernel, padding, dilation
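For reference, a hedged illustration of the autopad helper shown as context above (assuming it is importable from ultralytics.nn.modules.conv): with p=None it returns the "same"-padding value, folding dilation into the effective kernel size.

from ultralytics.nn.modules.conv import autopad  # assumed import path

print(autopad(3))        # 1  -> 'same' padding for a 3x3 kernel
print(autopad(5))        # 2
print(autopad(3, d=2))   # 2  -> dilation 2 makes the effective kernel 5x5
print(autopad((3, 5)))   # [1, 2] for rectangular kernels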
@@ -22,6 +35,7 @@ def autopad(k, p=None, d=1): # kernel, padding, dilation

  class Conv(nn.Module):
  """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
+
  default_act = nn.SiLU() # default activation

  def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
@@ -60,9 +74,9 @@ class Conv2(Conv):
  """Fuse parallel convolutions."""
  w = torch.zeros_like(self.conv.weight.data)
  i = [x // 2 for x in w.shape[2:]]
- w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
+ w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
  self.conv.weight.data += w
- self.__delattr__('cv2')
+ self.__delattr__("cv2")
  self.forward = self.forward_fuse


@@ -102,6 +116,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):

  class ConvTranspose(nn.Module):
  """Convolution transpose 2d layer."""
+
  default_act = nn.SiLU() # default activation

  def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
@@ -164,6 +179,7 @@ class RepConv(nn.Module):
  This module is used in RT-DETR.
  Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
  """
+
  default_act = nn.SiLU() # default activation

  def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
@@ -214,7 +230,7 @@ class RepConv(nn.Module):
  beta = branch.bn.bias
  eps = branch.bn.eps
  elif isinstance(branch, nn.BatchNorm2d):
- if not hasattr(self, 'id_tensor'):
+ if not hasattr(self, "id_tensor"):
  input_dim = self.c1 // self.g
  kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
  for i in range(self.c1):
@@ -232,29 +248,31 @@ class RepConv(nn.Module):

  def fuse_convs(self):
  """Combines two convolution layers into a single layer and removes unused attributes from the class."""
- if hasattr(self, 'conv'):
+ if hasattr(self, "conv"):
  return
  kernel, bias = self.get_equivalent_kernel_bias()
- self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
- out_channels=self.conv1.conv.out_channels,
- kernel_size=self.conv1.conv.kernel_size,
- stride=self.conv1.conv.stride,
- padding=self.conv1.conv.padding,
- dilation=self.conv1.conv.dilation,
- groups=self.conv1.conv.groups,
- bias=True).requires_grad_(False)
+ self.conv = nn.Conv2d(
+ in_channels=self.conv1.conv.in_channels,
+ out_channels=self.conv1.conv.out_channels,
+ kernel_size=self.conv1.conv.kernel_size,
+ stride=self.conv1.conv.stride,
+ padding=self.conv1.conv.padding,
+ dilation=self.conv1.conv.dilation,
+ groups=self.conv1.conv.groups,
+ bias=True,
+ ).requires_grad_(False)
  self.conv.weight.data = kernel
  self.conv.bias.data = bias
  for para in self.parameters():
  para.detach_()
- self.__delattr__('conv1')
- self.__delattr__('conv2')
- if hasattr(self, 'nm'):
- self.__delattr__('nm')
- if hasattr(self, 'bn'):
- self.__delattr__('bn')
- if hasattr(self, 'id_tensor'):
- self.__delattr__('id_tensor')
+ self.__delattr__("conv1")
+ self.__delattr__("conv2")
+ if hasattr(self, "nm"):
+ self.__delattr__("nm")
+ if hasattr(self, "bn"):
+ self.__delattr__("bn")
+ if hasattr(self, "id_tensor"):
+ self.__delattr__("id_tensor")


  class ChannelAttention(nn.Module):
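A hedged equivalence sketch for the RepConv.fuse_convs() lines reformatted above: in eval mode the fused single conv should reproduce the 3x3 + 1x1 (+ identity BatchNorm) training-time branches. The constructor arguments and the explicit forward_fuse call are assumptions; in the library it is the model-level fuse() step that typically rebinds forward to the fused path.

import torch
from ultralytics.nn.modules.conv import RepConv  # assumed import path

m = RepConv(16, 16, k=3, s=1, bn=True).eval()  # bn=True adds the identity BatchNorm branch
x = torch.rand(1, 16, 24, 24)
with torch.no_grad():
    y_branches = m(x)             # 3x3 branch + 1x1 branch + identity BN branch
    m.fuse_convs()                # builds m.conv from the equivalent kernel and bias
    y_fused = m.forward_fuse(x)   # fused single-conv path
print(torch.allclose(y_branches, y_fused, atol=1e-5))  # expected: True within float tolerance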
@@ -278,7 +296,7 @@ class SpatialAttention(nn.Module):
  def __init__(self, kernel_size=7):
  """Initialize Spatial-attention module with kernel size argument."""
  super().__init__()
- assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
+ assert kernel_size in (3, 7), "kernel size must be 3 or 7"
  padding = 3 if kernel_size == 7 else 1
  self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
  self.act = nn.Sigmoid()
ultralytics/nn/modules/head.py

@@ -14,11 +14,12 @@ from .conv import Conv
  from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
  from .utils import bias_init_with_prob, linear_init_

- __all__ = 'Detect', 'Segment', 'Pose', 'Classify', 'OBB', 'RTDETRDecoder'
+ __all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"


  class Detect(nn.Module):
  """YOLOv8 Detect head for detection models."""
+
  dynamic = False # force grid reconstruction
  export = False # export mode
  shape = None
@@ -35,7 +36,8 @@ class Detect(nn.Module):
  self.stride = torch.zeros(self.nl) # strides computed during build
  c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels
  self.cv2 = nn.ModuleList(
- nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
+ )
  self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
  self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

@@ -53,21 +55,21 @@ class Detect(nn.Module):
  self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
  self.shape = shape

- if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
- box = x_cat[:, :self.reg_max * 4]
- cls = x_cat[:, self.reg_max * 4:]
+ if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"): # avoid TF FlexSplitV ops
+ box = x_cat[:, : self.reg_max * 4]
+ cls = x_cat[:, self.reg_max * 4 :]
  else:
  box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
  dbox = self.decode_bboxes(box)

- if self.export and self.format in ('tflite', 'edgetpu'):
- # Normalize xywh with image size to mitigate quantization error of TFLite integer models as done in YOLOv5:
- # https://github.com/ultralytics/yolov5/blob/0c8de3fca4a702f8ff5c435e67f378d1fce70243/models/tf.py#L307-L309
- # See this PR for details: https://github.com/ultralytics/ultralytics/pull/1695
- img_h = shape[2] * self.stride[0]
- img_w = shape[3] * self.stride[0]
- img_size = torch.tensor([img_w, img_h, img_w, img_h], device=dbox.device).reshape(1, 4, 1)
- dbox /= img_size
+ if self.export and self.format in ("tflite", "edgetpu"):
+ # Precompute normalization factor to increase numerical stability
+ # See https://github.com/ultralytics/ultralytics/issues/7371
+ img_h = shape[2]
+ img_w = shape[3]
+ img_size = torch.tensor([img_w, img_h, img_w, img_h], device=box.device).reshape(1, 4, 1)
+ norm = self.strides / (self.stride[0] * img_size)
+ dbox = dist2bbox(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2], xywh=True, dim=1)

  y = torch.cat((dbox, cls.sigmoid()), 1)
  return y if self.export else (y, x)
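The replaced branch above changes how TFLite/Edge TPU exports normalize boxes: instead of decoding to pixel units and dividing by the image size afterwards, the distances and anchors are pre-scaled so dist2bbox emits boxes already in [0, 1]. A hedged toy check of that algebra (dist2bbox is assumed to come from ultralytics.utils.tal; the shapes and values below are illustrative, not from the package):

import torch
from ultralytics.utils.tal import dist2bbox  # assumed import path

anchors = torch.tensor([[0.5, 2.5], [1.5, 0.5]])          # (2, n): row 0 = x, row 1 = y, in grid units
strides = torch.tensor([[8.0, 16.0]])                     # (1, n): stride of each anchor
dist = torch.rand(1, 4, 2) * 3                            # (bs, 4, n): ltrb distances in grid units
img_size = torch.tensor([80.0, 64.0, 80.0, 64.0]).reshape(1, 4, 1)  # [w, h, w, h] in pixels

old = dist2bbox(dist, anchors.unsqueeze(0), xywh=True, dim=1) * strides / img_size  # decode, then normalize
norm = strides / img_size                                  # analogous to self.strides / (stride[0] * img_size)
new = dist2bbox(dist * norm, anchors.unsqueeze(0) * norm[:, :2], xywh=True, dim=1)  # pre-scaled decode
print(torch.allclose(old, new, atol=1e-6))                 # expected: True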
@@ -79,7 +81,7 @@ class Detect(nn.Module):
  # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
  for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
  a[-1].bias.data[:] = 1.0 # box
- b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
+ b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)

  def decode_bboxes(self, bboxes):
  """Decode bounding boxes."""
@@ -116,6 +118,7 @@ class OBB(Detect):
  """YOLOv8 OBB detection head for detection with rotation models."""

  def __init__(self, nc=80, ne=1, ch=()):
+ """Initialize OBB with number of classes `nc` and layer channels `ch`."""
  super().__init__(nc, ch)
  self.ne = ne # number of extra parameters
  self.detect = Detect.forward
@@ -124,6 +127,7 @@ class OBB(Detect):
  self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.ne, 1)) for x in ch)

  def forward(self, x):
+ """Concatenates and returns predicted bounding boxes and class probabilities."""
  bs = x[0].shape[0] # batch size
  angle = torch.cat([self.cv4[i](x[i]).view(bs, self.ne, -1) for i in range(self.nl)], 2) # OBB theta logits
  # NOTE: set `angle` as an attribute so that `decode_bboxes` could use it.
@@ -212,26 +216,28 @@ class RTDETRDecoder(nn.Module):
  and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
  Transformer decoder layers to output the final predictions.
  """
+
  export = False # export mode

  def __init__(
- self,
- nc=80,
- ch=(512, 1024, 2048),
- hd=256, # hidden dim
- nq=300, # num queries
- ndp=4, # num decoder points
- nh=8, # num head
- ndl=6, # num decoder layers
- d_ffn=1024, # dim of feedforward
- dropout=0.,
- act=nn.ReLU(),
- eval_idx=-1,
- # Training args
- nd=100, # num denoising
- label_noise_ratio=0.5,
- box_noise_scale=1.0,
- learnt_init_query=False):
+ self,
+ nc=80,
+ ch=(512, 1024, 2048),
+ hd=256, # hidden dim
+ nq=300, # num queries
+ ndp=4, # num decoder points
+ nh=8, # num head
+ ndl=6, # num decoder layers
+ d_ffn=1024, # dim of feedforward
+ dropout=0.0,
+ act=nn.ReLU(),
+ eval_idx=-1,
+ # Training args
+ nd=100, # num denoising
+ label_noise_ratio=0.5,
+ box_noise_scale=1.0,
+ learnt_init_query=False,
+ ):
  """
  Initializes the RTDETRDecoder module with the given parameters.

@@ -300,28 +306,30 @@ class RTDETRDecoder(nn.Module):
  feats, shapes = self._get_encoder_input(x)

  # Prepare denoising training
- dn_embed, dn_bbox, attn_mask, dn_meta = \
- get_cdn_group(batch,
- self.nc,
- self.num_queries,
- self.denoising_class_embed.weight,
- self.num_denoising,
- self.label_noise_ratio,
- self.box_noise_scale,
- self.training)
-
- embed, refer_bbox, enc_bboxes, enc_scores = \
- self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
+ dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
+ batch,
+ self.nc,
+ self.num_queries,
+ self.denoising_class_embed.weight,
+ self.num_denoising,
+ self.label_noise_ratio,
+ self.box_noise_scale,
+ self.training,
+ )
+
+ embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)

  # Decoder
- dec_bboxes, dec_scores = self.decoder(embed,
- refer_bbox,
- feats,
- shapes,
- self.dec_bbox_head,
- self.dec_score_head,
- self.query_pos_head,
- attn_mask=attn_mask)
+ dec_bboxes, dec_scores = self.decoder(
+ embed,
+ refer_bbox,
+ feats,
+ shapes,
+ self.dec_bbox_head,
+ self.dec_score_head,
+ self.query_pos_head,
+ attn_mask=attn_mask,
+ )
  x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
  if self.training:
  return x
@@ -329,24 +337,24 @@ class RTDETRDecoder(nn.Module):
  y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
  return y if self.export else (y, x)

- def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
+ def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
  """Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
  anchors = []
  for i, (h, w) in enumerate(shapes):
  sy = torch.arange(end=h, dtype=dtype, device=device)
  sx = torch.arange(end=w, dtype=dtype, device=device)
- grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
+ grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
  grid_xy = torch.stack([grid_x, grid_y], -1) # (h, w, 2)

  valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
  grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH # (1, h, w, 2)
- wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
+ wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
  anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4)) # (1, h*w, 4)

  anchors = torch.cat(anchors, 1) # (1, h*w*nl, 4)
  valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) # 1, h*w*nl, 1
  anchors = torch.log(anchors / (1 - anchors))
- anchors = anchors.masked_fill(~valid_mask, float('inf'))
+ anchors = anchors.masked_fill(~valid_mask, float("inf"))
  return anchors, valid_mask

  def _get_encoder_input(self, x):
@@ -413,13 +421,13 @@ class RTDETRDecoder(nn.Module):
  # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
  # linear_init_(self.enc_score_head)
  constant_(self.enc_score_head.bias, bias_cls)
- constant_(self.enc_bbox_head.layers[-1].weight, 0.)
- constant_(self.enc_bbox_head.layers[-1].bias, 0.)
+ constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
+ constant_(self.enc_bbox_head.layers[-1].bias, 0.0)
  for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
  # linear_init_(cls_)
  constant_(cls_.bias, bias_cls)
- constant_(reg_.layers[-1].weight, 0.)
- constant_(reg_.layers[-1].bias, 0.)
+ constant_(reg_.layers[-1].weight, 0.0)
+ constant_(reg_.layers[-1].bias, 0.0)

  linear_init_(self.enc_output[0])
  xavier_uniform_(self.enc_output[0].weight)
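The bias_cls constant used above is produced by bias_init_with_prob, which sets a classification bias so the initial sigmoid score roughly equals a chosen prior probability. A hedged numeric check (the import path is an assumption):

import math
from ultralytics.nn.modules.utils import bias_init_with_prob  # assumed import path

b = bias_init_with_prob(0.01)              # -log((1 - p) / p)
print(round(b, 4))                         # -4.5951
print(round(1 / (1 + math.exp(-b)), 4))    # 0.01 -> initial sigmoid score matches the prior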
ultralytics/nn/modules/transformer.py

@@ -11,8 +11,18 @@ from torch.nn.init import constant_, xavier_uniform_
  from .conv import Conv
  from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch

- __all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
- 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
+ __all__ = (
+ "TransformerEncoderLayer",
+ "TransformerLayer",
+ "TransformerBlock",
+ "MLPBlock",
+ "LayerNorm2d",
+ "AIFI",
+ "DeformableTransformerDecoder",
+ "DeformableTransformerDecoderLayer",
+ "MSDeformAttn",
+ "MLP",
+ )


  class TransformerEncoderLayer(nn.Module):
@@ -22,9 +32,11 @@ class TransformerEncoderLayer(nn.Module):
  """Initialize the TransformerEncoderLayer with specified parameters."""
  super().__init__()
  from ...utils.torch_utils import TORCH_1_9
+
  if not TORCH_1_9:
  raise ModuleNotFoundError(
- 'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
+ "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
+ )
  self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
  # Implementation of Feedforward model
  self.fc1 = nn.Linear(c1, cm)
@@ -91,12 +103,11 @@ class AIFI(TransformerEncoderLayer):
  """Builds 2D sine-cosine position embedding."""
  grid_w = torch.arange(int(w), dtype=torch.float32)
  grid_h = torch.arange(int(h), dtype=torch.float32)
- grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
- assert embed_dim % 4 == 0, \
- 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+ grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+ assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
  pos_dim = embed_dim // 4
  omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
- omega = 1. / (temperature ** omega)
+ omega = 1.0 / (temperature**omega)

  out_w = grid_w.flatten()[..., None] @ omega[None]
  out_h = grid_h.flatten()[..., None] @ omega[None]
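The lines above are the core of AIFI's 2D sine-cosine position embedding. A standalone hedged sketch of the same construction (the function name and defaults below are illustrative, not the library's API):

import torch

def sincos_pos_embed_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0):
    """Build a (1, w*h, embed_dim) 2D sine-cosine position embedding."""
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    assert embed_dim % 4 == 0, "embed_dim must be divisible by 4"
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)                   # (pos_dim,) frequency bands
    out_w = grid_w.flatten()[..., None] @ omega[None]    # (w*h, pos_dim)
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]

print(sincos_pos_embed_2d(20, 20).shape)  # torch.Size([1, 400, 256])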
@@ -213,10 +224,10 @@ class MSDeformAttn(nn.Module):
  """Initialize MSDeformAttn with the given parameters."""
  super().__init__()
  if d_model % n_heads != 0:
- raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
+ raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
  _d_per_head = d_model // n_heads
  # Better to set _d_per_head to a power of 2 which is more efficient in a CUDA implementation
- assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'
+ assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"

  self.im2col_step = 64

@@ -234,21 +245,24 @@ class MSDeformAttn(nn.Module):

  def _reset_parameters(self):
  """Reset module parameters."""
- constant_(self.sampling_offsets.weight.data, 0.)
+ constant_(self.sampling_offsets.weight.data, 0.0)
  thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
  grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
- grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
- 1, self.n_levels, self.n_points, 1)
+ grid_init = (
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+ .view(self.n_heads, 1, 1, 2)
+ .repeat(1, self.n_levels, self.n_points, 1)
+ )
  for i in range(self.n_points):
  grid_init[:, :, i, :] *= i + 1
  with torch.no_grad():
  self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
- constant_(self.attention_weights.weight.data, 0.)
- constant_(self.attention_weights.bias.data, 0.)
+ constant_(self.attention_weights.weight.data, 0.0)
+ constant_(self.attention_weights.bias.data, 0.0)
  xavier_uniform_(self.value_proj.weight.data)
- constant_(self.value_proj.bias.data, 0.)
+ constant_(self.value_proj.bias.data, 0.0)
  xavier_uniform_(self.output_proj.weight.data)
- constant_(self.output_proj.bias.data, 0.)
+ constant_(self.output_proj.bias.data, 0.0)

  def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
  """
@@ -288,7 +302,7 @@ class MSDeformAttn(nn.Module):
  add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
  sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
  else:
- raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
  output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
  return self.output_proj(output)

@@ -301,7 +315,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
  https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
  """

- def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
+ def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
  """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
  super().__init__()

@@ -339,14 +353,16 @@ class DeformableTransformerDecoderLayer(nn.Module):

  # Self attention
  q = k = self.with_pos_embed(embed, query_pos)
- tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
- attn_mask=attn_mask)[0].transpose(0, 1)
+ tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
+ 0
+ ].transpose(0, 1)
  embed = embed + self.dropout1(tgt)
  embed = self.norm1(embed)

  # Cross attention
- tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
- padding_mask)
+ tgt = self.cross_attn(
+ self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
+ )
  embed = embed + self.dropout2(tgt)
  embed = self.norm2(embed)

@@ -370,16 +386,17 @@ class DeformableTransformerDecoder(nn.Module):
  self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

  def forward(
- self,
- embed, # decoder embeddings
- refer_bbox, # anchor
- feats, # image features
- shapes, # feature shapes
- bbox_head,
- score_head,
- pos_mlp,
- attn_mask=None,
- padding_mask=None):
+ self,
+ embed, # decoder embeddings
+ refer_bbox, # anchor
+ feats, # image features
+ shapes, # feature shapes
+ bbox_head,
+ score_head,
+ pos_mlp,
+ attn_mask=None,
+ padding_mask=None,
+ ):
  """Perform the forward pass through the entire decoder."""
  output = embed
  dec_bboxes = []
ultralytics/nn/modules/utils.py

@@ -10,7 +10,7 @@ import torch.nn as nn
  import torch.nn.functional as F
  from torch.nn.init import uniform_

- __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
+ __all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"


  def _get_clones(module, n):
@@ -27,7 +27,7 @@ def linear_init_(module):
  """Initialize the weights and biases of a linear module."""
  bound = 1 / math.sqrt(module.weight.shape[0])
  uniform_(module.weight, -bound, bound)
- if hasattr(module, 'bias') and module.bias is not None:
+ if hasattr(module, "bias") and module.bias is not None:
  uniform_(module.bias, -bound, bound)


@@ -39,9 +39,12 @@ def inverse_sigmoid(x, eps=1e-5):
  return torch.log(x1 / x2)


- def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
- sampling_locations: torch.Tensor,
- attention_weights: torch.Tensor) -> torch.Tensor:
+ def multi_scale_deformable_attn_pytorch(
+ value: torch.Tensor,
+ value_spatial_shapes: torch.Tensor,
+ sampling_locations: torch.Tensor,
+ attention_weights: torch.Tensor,
+ ) -> torch.Tensor:
  """
  Multi-scale deformable attention.

@@ -58,23 +61,25 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
  # bs, H_*W_, num_heads*embed_dims ->
  # bs, num_heads*embed_dims, H_*W_ ->
  # bs*num_heads, embed_dims, H_, W_
- value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
+ value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
  # bs, num_queries, num_heads, num_points, 2 ->
  # bs, num_heads, num_queries, num_points, 2 ->
  # bs*num_heads, num_queries, num_points, 2
  sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
  # bs*num_heads, embed_dims, num_queries, num_points
- sampling_value_l_ = F.grid_sample(value_l_,
- sampling_grid_l_,
- mode='bilinear',
- padding_mode='zeros',
- align_corners=False)
+ sampling_value_l_ = F.grid_sample(
+ value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+ )
  sampling_value_list.append(sampling_value_l_)
  # (bs, num_queries, num_heads, num_levels, num_points) ->
  # (bs, num_heads, num_queries, num_levels, num_points) ->
  # (bs, num_heads, 1, num_queries, num_levels*num_points)
- attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
- num_levels * num_points)
- output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(
- bs, num_heads * embed_dims, num_queries))
+ attention_weights = attention_weights.transpose(1, 2).reshape(
+ bs * num_heads, 1, num_queries, num_levels * num_points
+ )
+ output = (
+ (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+ .sum(-1)
+ .view(bs, num_heads * embed_dims, num_queries)
+ )
  return output.transpose(1, 2).contiguous()
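To make the reformatted function's tensor contract concrete, a hedged shape check (sizes are illustrative assumptions; the import path is assumed to be ultralytics.nn.modules.utils, and the spatial shapes are passed as a plain list of (H, W) pairs, which the function only iterates over):

import torch
from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch  # assumed import path

bs, heads, dims, queries, levels, points = 2, 8, 32, 100, 2, 4
shapes = [(8, 8), (4, 4)]                                    # per-level (H, W)
num_value = sum(h * w for h, w in shapes)                    # 80 flattened tokens across both levels
value = torch.rand(bs, num_value, heads, dims)
sampling_locations = torch.rand(bs, queries, heads, levels, points, 2)    # normalized to [0, 1]
attention_weights = torch.rand(bs, queries, heads, levels, points).softmax(-1)  # any normalization works for a shape check
out = multi_scale_deformable_attn_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)  # torch.Size([2, 100, 256]) -> (bs, num_queries, heads * dims)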