ultralytics 8.0.237__py3-none-any.whl → 8.0.239__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries, and reflects the changes between those versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of ultralytics might be problematic.

Files changed (137)
  1. ultralytics/__init__.py +2 -2
  2. ultralytics/cfg/__init__.py +241 -138
  3. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  4. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  5. ultralytics/cfg/datasets/dota8.yaml +34 -0
  6. ultralytics/data/__init__.py +9 -2
  7. ultralytics/data/annotator.py +4 -4
  8. ultralytics/data/augment.py +186 -169
  9. ultralytics/data/base.py +54 -48
  10. ultralytics/data/build.py +34 -23
  11. ultralytics/data/converter.py +242 -70
  12. ultralytics/data/dataset.py +117 -95
  13. ultralytics/data/explorer/__init__.py +5 -0
  14. ultralytics/data/explorer/explorer.py +170 -97
  15. ultralytics/data/explorer/gui/__init__.py +1 -0
  16. ultralytics/data/explorer/gui/dash.py +146 -76
  17. ultralytics/data/explorer/utils.py +87 -25
  18. ultralytics/data/loaders.py +75 -62
  19. ultralytics/data/split_dota.py +44 -36
  20. ultralytics/data/utils.py +160 -142
  21. ultralytics/engine/exporter.py +348 -292
  22. ultralytics/engine/model.py +102 -66
  23. ultralytics/engine/predictor.py +74 -55
  24. ultralytics/engine/results.py +63 -40
  25. ultralytics/engine/trainer.py +192 -144
  26. ultralytics/engine/tuner.py +66 -59
  27. ultralytics/engine/validator.py +31 -26
  28. ultralytics/hub/__init__.py +54 -31
  29. ultralytics/hub/auth.py +28 -25
  30. ultralytics/hub/session.py +282 -133
  31. ultralytics/hub/utils.py +64 -42
  32. ultralytics/models/__init__.py +1 -1
  33. ultralytics/models/fastsam/__init__.py +1 -1
  34. ultralytics/models/fastsam/model.py +6 -6
  35. ultralytics/models/fastsam/predict.py +3 -2
  36. ultralytics/models/fastsam/prompt.py +55 -48
  37. ultralytics/models/fastsam/val.py +1 -1
  38. ultralytics/models/nas/__init__.py +1 -1
  39. ultralytics/models/nas/model.py +9 -8
  40. ultralytics/models/nas/predict.py +8 -6
  41. ultralytics/models/nas/val.py +11 -9
  42. ultralytics/models/rtdetr/__init__.py +1 -1
  43. ultralytics/models/rtdetr/model.py +11 -9
  44. ultralytics/models/rtdetr/train.py +18 -16
  45. ultralytics/models/rtdetr/val.py +25 -19
  46. ultralytics/models/sam/__init__.py +1 -1
  47. ultralytics/models/sam/amg.py +13 -14
  48. ultralytics/models/sam/build.py +44 -42
  49. ultralytics/models/sam/model.py +6 -6
  50. ultralytics/models/sam/modules/decoders.py +6 -4
  51. ultralytics/models/sam/modules/encoders.py +37 -35
  52. ultralytics/models/sam/modules/sam.py +5 -4
  53. ultralytics/models/sam/modules/tiny_encoder.py +95 -73
  54. ultralytics/models/sam/modules/transformer.py +3 -2
  55. ultralytics/models/sam/predict.py +39 -27
  56. ultralytics/models/utils/loss.py +99 -95
  57. ultralytics/models/utils/ops.py +34 -31
  58. ultralytics/models/yolo/__init__.py +1 -1
  59. ultralytics/models/yolo/classify/__init__.py +1 -1
  60. ultralytics/models/yolo/classify/predict.py +8 -6
  61. ultralytics/models/yolo/classify/train.py +37 -31
  62. ultralytics/models/yolo/classify/val.py +26 -24
  63. ultralytics/models/yolo/detect/__init__.py +1 -1
  64. ultralytics/models/yolo/detect/predict.py +8 -6
  65. ultralytics/models/yolo/detect/train.py +47 -37
  66. ultralytics/models/yolo/detect/val.py +100 -82
  67. ultralytics/models/yolo/model.py +31 -25
  68. ultralytics/models/yolo/obb/__init__.py +1 -1
  69. ultralytics/models/yolo/obb/predict.py +13 -12
  70. ultralytics/models/yolo/obb/train.py +3 -3
  71. ultralytics/models/yolo/obb/val.py +80 -58
  72. ultralytics/models/yolo/pose/__init__.py +1 -1
  73. ultralytics/models/yolo/pose/predict.py +17 -12
  74. ultralytics/models/yolo/pose/train.py +28 -25
  75. ultralytics/models/yolo/pose/val.py +91 -64
  76. ultralytics/models/yolo/segment/__init__.py +1 -1
  77. ultralytics/models/yolo/segment/predict.py +10 -8
  78. ultralytics/models/yolo/segment/train.py +16 -15
  79. ultralytics/models/yolo/segment/val.py +90 -68
  80. ultralytics/nn/__init__.py +26 -6
  81. ultralytics/nn/autobackend.py +144 -112
  82. ultralytics/nn/modules/__init__.py +96 -13
  83. ultralytics/nn/modules/block.py +28 -7
  84. ultralytics/nn/modules/conv.py +41 -23
  85. ultralytics/nn/modules/head.py +67 -59
  86. ultralytics/nn/modules/transformer.py +49 -32
  87. ultralytics/nn/modules/utils.py +20 -15
  88. ultralytics/nn/tasks.py +215 -141
  89. ultralytics/solutions/ai_gym.py +59 -47
  90. ultralytics/solutions/distance_calculation.py +22 -15
  91. ultralytics/solutions/heatmap.py +76 -54
  92. ultralytics/solutions/object_counter.py +46 -39
  93. ultralytics/solutions/speed_estimation.py +13 -16
  94. ultralytics/trackers/__init__.py +1 -1
  95. ultralytics/trackers/basetrack.py +1 -0
  96. ultralytics/trackers/bot_sort.py +2 -1
  97. ultralytics/trackers/byte_tracker.py +10 -7
  98. ultralytics/trackers/track.py +7 -7
  99. ultralytics/trackers/utils/gmc.py +25 -25
  100. ultralytics/trackers/utils/kalman_filter.py +85 -42
  101. ultralytics/trackers/utils/matching.py +8 -7
  102. ultralytics/utils/__init__.py +173 -151
  103. ultralytics/utils/autobatch.py +10 -10
  104. ultralytics/utils/benchmarks.py +76 -86
  105. ultralytics/utils/callbacks/__init__.py +1 -1
  106. ultralytics/utils/callbacks/base.py +29 -29
  107. ultralytics/utils/callbacks/clearml.py +51 -43
  108. ultralytics/utils/callbacks/comet.py +81 -66
  109. ultralytics/utils/callbacks/dvc.py +33 -26
  110. ultralytics/utils/callbacks/hub.py +44 -26
  111. ultralytics/utils/callbacks/mlflow.py +31 -24
  112. ultralytics/utils/callbacks/neptune.py +35 -25
  113. ultralytics/utils/callbacks/raytune.py +9 -4
  114. ultralytics/utils/callbacks/tensorboard.py +16 -11
  115. ultralytics/utils/callbacks/wb.py +39 -33
  116. ultralytics/utils/checks.py +189 -141
  117. ultralytics/utils/dist.py +15 -12
  118. ultralytics/utils/downloads.py +112 -96
  119. ultralytics/utils/errors.py +1 -1
  120. ultralytics/utils/files.py +11 -11
  121. ultralytics/utils/instance.py +22 -22
  122. ultralytics/utils/loss.py +117 -67
  123. ultralytics/utils/metrics.py +224 -158
  124. ultralytics/utils/ops.py +39 -29
  125. ultralytics/utils/patches.py +3 -3
  126. ultralytics/utils/plotting.py +217 -120
  127. ultralytics/utils/tal.py +19 -13
  128. ultralytics/utils/torch_utils.py +138 -109
  129. ultralytics/utils/triton.py +12 -10
  130. ultralytics/utils/tuner.py +49 -47
  131. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/METADATA +5 -4
  132. ultralytics-8.0.239.dist-info/RECORD +188 -0
  133. ultralytics-8.0.237.dist-info/RECORD +0 -187
  134. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/LICENSE +0 -0
  135. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/WHEEL +0 -0
  136. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/entry_points.txt +0 -0
  137. {ultralytics-8.0.237.dist-info → ultralytics-8.0.239.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/block.py

@@ -8,8 +8,26 @@ import torch.nn.functional as F
  from .conv import Conv, DWConv, GhostConv, LightConv, RepConv
  from .transformer import TransformerBlock

- __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost',
- 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3', 'ResNetLayer')
+ __all__ = (
+ "DFL",
+ "HGBlock",
+ "HGStem",
+ "SPP",
+ "SPPF",
+ "C1",
+ "C2",
+ "C3",
+ "C2f",
+ "C3x",
+ "C3TR",
+ "C3Ghost",
+ "GhostBottleneck",
+ "Bottleneck",
+ "BottleneckCSP",
+ "Proto",
+ "RepC3",
+ "ResNetLayer",
+ )


  class DFL(nn.Module):
@@ -284,9 +302,11 @@ class GhostBottleneck(nn.Module):
  self.conv = nn.Sequential(
  GhostConv(c1, c_, 1, 1), # pw
  DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw
- GhostConv(c_, c2, 1, 1, act=False)) # pw-linear
- self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1,
- act=False)) if s == 2 else nn.Identity()
+ GhostConv(c_, c2, 1, 1, act=False), # pw-linear
+ )
+ self.shortcut = (
+ nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
+ )

  def forward(self, x):
  """Applies skip connection and concatenation to input tensor."""
@@ -359,8 +379,9 @@ class ResNetLayer(nn.Module):
  self.is_first = is_first

  if self.is_first:
- self.layer = nn.Sequential(Conv(c1, c2, k=7, s=2, p=3, act=True),
- nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
+ self.layer = nn.Sequential(
+ Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ )
  else:
  blocks = [ResNetBlock(c1, c2, s, e=e)]
  blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
ultralytics/nn/modules/conv.py

@@ -7,8 +7,21 @@ import numpy as np
  import torch
  import torch.nn as nn

- __all__ = ('Conv', 'Conv2', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv',
- 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'RepConv')
+ __all__ = (
+ "Conv",
+ "Conv2",
+ "LightConv",
+ "DWConv",
+ "DWConvTranspose2d",
+ "ConvTranspose",
+ "Focus",
+ "GhostConv",
+ "ChannelAttention",
+ "SpatialAttention",
+ "CBAM",
+ "Concat",
+ "RepConv",
+ )


  def autopad(k, p=None, d=1): # kernel, padding, dilation
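For reference, a hedged illustration of the autopad helper shown as context above (assuming it is importable from ultralytics.nn.modules.conv): with p=None it returns the "same"-padding value, folding dilation into the effective kernel size.

from ultralytics.nn.modules.conv import autopad  # assumed import path

print(autopad(3))        # 1  -> 'same' padding for a 3x3 kernel
print(autopad(5))        # 2
print(autopad(3, d=2))   # 2  -> dilation 2 makes the effective kernel 5x5
print(autopad((3, 5)))   # [1, 2] for rectangular kernels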
@@ -22,6 +35,7 @@ def autopad(k, p=None, d=1): # kernel, padding, dilation

  class Conv(nn.Module):
  """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
+
  default_act = nn.SiLU() # default activation

  def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
@@ -60,9 +74,9 @@ class Conv2(Conv):
  """Fuse parallel convolutions."""
  w = torch.zeros_like(self.conv.weight.data)
  i = [x // 2 for x in w.shape[2:]]
- w[:, :, i[0]:i[0] + 1, i[1]:i[1] + 1] = self.cv2.weight.data.clone()
+ w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
  self.conv.weight.data += w
- self.__delattr__('cv2')
+ self.__delattr__("cv2")
  self.forward = self.forward_fuse


@@ -102,6 +116,7 @@ class DWConvTranspose2d(nn.ConvTranspose2d):

  class ConvTranspose(nn.Module):
  """Convolution transpose 2d layer."""
+
  default_act = nn.SiLU() # default activation

  def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
@@ -164,6 +179,7 @@ class RepConv(nn.Module):
  This module is used in RT-DETR.
  Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
  """
+
  default_act = nn.SiLU() # default activation

  def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
@@ -214,7 +230,7 @@ class RepConv(nn.Module):
  beta = branch.bn.bias
  eps = branch.bn.eps
  elif isinstance(branch, nn.BatchNorm2d):
- if not hasattr(self, 'id_tensor'):
+ if not hasattr(self, "id_tensor"):
  input_dim = self.c1 // self.g
  kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
  for i in range(self.c1):
@@ -232,29 +248,31 @@ class RepConv(nn.Module):

  def fuse_convs(self):
  """Combines two convolution layers into a single layer and removes unused attributes from the class."""
- if hasattr(self, 'conv'):
+ if hasattr(self, "conv"):
  return
  kernel, bias = self.get_equivalent_kernel_bias()
- self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
- out_channels=self.conv1.conv.out_channels,
- kernel_size=self.conv1.conv.kernel_size,
- stride=self.conv1.conv.stride,
- padding=self.conv1.conv.padding,
- dilation=self.conv1.conv.dilation,
- groups=self.conv1.conv.groups,
- bias=True).requires_grad_(False)
+ self.conv = nn.Conv2d(
+ in_channels=self.conv1.conv.in_channels,
+ out_channels=self.conv1.conv.out_channels,
+ kernel_size=self.conv1.conv.kernel_size,
+ stride=self.conv1.conv.stride,
+ padding=self.conv1.conv.padding,
+ dilation=self.conv1.conv.dilation,
+ groups=self.conv1.conv.groups,
+ bias=True,
+ ).requires_grad_(False)
  self.conv.weight.data = kernel
  self.conv.bias.data = bias
  for para in self.parameters():
  para.detach_()
- self.__delattr__('conv1')
- self.__delattr__('conv2')
- if hasattr(self, 'nm'):
- self.__delattr__('nm')
- if hasattr(self, 'bn'):
- self.__delattr__('bn')
- if hasattr(self, 'id_tensor'):
- self.__delattr__('id_tensor')
+ self.__delattr__("conv1")
+ self.__delattr__("conv2")
+ if hasattr(self, "nm"):
+ self.__delattr__("nm")
+ if hasattr(self, "bn"):
+ self.__delattr__("bn")
+ if hasattr(self, "id_tensor"):
+ self.__delattr__("id_tensor")


  class ChannelAttention(nn.Module):
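A hedged equivalence sketch for the RepConv.fuse_convs() lines reformatted above: in eval mode the fused single conv should reproduce the 3x3 + 1x1 (+ identity BatchNorm) training-time branches. The constructor arguments and the explicit forward_fuse call are assumptions; in the library it is the model-level fuse() step that typically rebinds forward to the fused path.

import torch
from ultralytics.nn.modules.conv import RepConv  # assumed import path

m = RepConv(16, 16, k=3, s=1, bn=True).eval()  # bn=True adds the identity BatchNorm branch
x = torch.rand(1, 16, 24, 24)
with torch.no_grad():
    y_branches = m(x)             # 3x3 branch + 1x1 branch + identity BN branch
    m.fuse_convs()                # builds m.conv from the equivalent kernel and bias
    y_fused = m.forward_fuse(x)   # fused single-conv path
print(torch.allclose(y_branches, y_fused, atol=1e-5))  # expected: True within float tolerance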
@@ -278,7 +296,7 @@ class SpatialAttention(nn.Module):
  def __init__(self, kernel_size=7):
  """Initialize Spatial-attention module with kernel size argument."""
  super().__init__()
- assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
+ assert kernel_size in (3, 7), "kernel size must be 3 or 7"
  padding = 3 if kernel_size == 7 else 1
  self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
  self.act = nn.Sigmoid()
ultralytics/nn/modules/head.py

@@ -14,11 +14,12 @@ from .conv import Conv
  from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
  from .utils import bias_init_with_prob, linear_init_

- __all__ = 'Detect', 'Segment', 'Pose', 'Classify', 'OBB', 'RTDETRDecoder'
+ __all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"


  class Detect(nn.Module):
  """YOLOv8 Detect head for detection models."""
+
  dynamic = False # force grid reconstruction
  export = False # export mode
  shape = None
@@ -35,7 +36,8 @@ class Detect(nn.Module):
  self.stride = torch.zeros(self.nl) # strides computed during build
  c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels
  self.cv2 = nn.ModuleList(
- nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
+ nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
+ )
  self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
  self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

@@ -53,21 +55,21 @@ class Detect(nn.Module):
  self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
  self.shape = shape

- if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops
- box = x_cat[:, :self.reg_max * 4]
- cls = x_cat[:, self.reg_max * 4:]
+ if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"): # avoid TF FlexSplitV ops
+ box = x_cat[:, : self.reg_max * 4]
+ cls = x_cat[:, self.reg_max * 4 :]
  else:
  box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
  dbox = self.decode_bboxes(box)

- if self.export and self.format in ('tflite', 'edgetpu'):
- # Normalize xywh with image size to mitigate quantization error of TFLite integer models as done in YOLOv5:
- # https://github.com/ultralytics/yolov5/blob/0c8de3fca4a702f8ff5c435e67f378d1fce70243/models/tf.py#L307-L309
- # See this PR for details: https://github.com/ultralytics/ultralytics/pull/1695
- img_h = shape[2] * self.stride[0]
- img_w = shape[3] * self.stride[0]
- img_size = torch.tensor([img_w, img_h, img_w, img_h], device=dbox.device).reshape(1, 4, 1)
- dbox /= img_size
+ if self.export and self.format in ("tflite", "edgetpu"):
+ # Precompute normalization factor to increase numerical stability
+ # See https://github.com/ultralytics/ultralytics/issues/7371
+ img_h = shape[2]
+ img_w = shape[3]
+ img_size = torch.tensor([img_w, img_h, img_w, img_h], device=box.device).reshape(1, 4, 1)
+ norm = self.strides / (self.stride[0] * img_size)
+ dbox = dist2bbox(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2], xywh=True, dim=1)

  y = torch.cat((dbox, cls.sigmoid()), 1)
  return y if self.export else (y, x)
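The replaced branch above changes how TFLite/Edge TPU exports normalize boxes: instead of decoding to pixel units and dividing by the image size afterwards, the distances and anchors are pre-scaled so dist2bbox emits boxes already in [0, 1]. A hedged toy check of that algebra (dist2bbox is assumed to come from ultralytics.utils.tal; the shapes and values below are illustrative, not from the package):

import torch
from ultralytics.utils.tal import dist2bbox  # assumed import path

anchors = torch.tensor([[0.5, 2.5], [1.5, 0.5]])          # (2, n): row 0 = x, row 1 = y, in grid units
strides = torch.tensor([[8.0, 16.0]])                     # (1, n): stride of each anchor
dist = torch.rand(1, 4, 2) * 3                            # (bs, 4, n): ltrb distances in grid units
img_size = torch.tensor([80.0, 64.0, 80.0, 64.0]).reshape(1, 4, 1)  # [w, h, w, h] in pixels

old = dist2bbox(dist, anchors.unsqueeze(0), xywh=True, dim=1) * strides / img_size  # decode, then normalize
norm = strides / img_size                                  # analogous to self.strides / (stride[0] * img_size)
new = dist2bbox(dist * norm, anchors.unsqueeze(0) * norm[:, :2], xywh=True, dim=1)  # pre-scaled decode
print(torch.allclose(old, new, atol=1e-6))                 # expected: True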
@@ -79,7 +81,7 @@ class Detect(nn.Module):
  # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
  for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
  a[-1].bias.data[:] = 1.0 # box
- b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
+ b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)

  def decode_bboxes(self, bboxes):
  """Decode bounding boxes."""
@@ -116,6 +118,7 @@ class OBB(Detect):
  """YOLOv8 OBB detection head for detection with rotation models."""

  def __init__(self, nc=80, ne=1, ch=()):
+ """Initialize OBB with number of classes `nc` and layer channels `ch`."""
  super().__init__(nc, ch)
  self.ne = ne # number of extra parameters
  self.detect = Detect.forward
@@ -124,6 +127,7 @@ class OBB(Detect):
  self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.ne, 1)) for x in ch)

  def forward(self, x):
+ """Concatenates and returns predicted bounding boxes and class probabilities."""
  bs = x[0].shape[0] # batch size
  angle = torch.cat([self.cv4[i](x[i]).view(bs, self.ne, -1) for i in range(self.nl)], 2) # OBB theta logits
  # NOTE: set `angle` as an attribute so that `decode_bboxes` could use it.
@@ -212,26 +216,28 @@ class RTDETRDecoder(nn.Module):
  and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
  Transformer decoder layers to output the final predictions.
  """
+
  export = False # export mode

  def __init__(
- self,
- nc=80,
- ch=(512, 1024, 2048),
- hd=256, # hidden dim
- nq=300, # num queries
- ndp=4, # num decoder points
- nh=8, # num head
- ndl=6, # num decoder layers
- d_ffn=1024, # dim of feedforward
- dropout=0.,
- act=nn.ReLU(),
- eval_idx=-1,
- # Training args
- nd=100, # num denoising
- label_noise_ratio=0.5,
- box_noise_scale=1.0,
- learnt_init_query=False):
+ self,
+ nc=80,
+ ch=(512, 1024, 2048),
+ hd=256, # hidden dim
+ nq=300, # num queries
+ ndp=4, # num decoder points
+ nh=8, # num head
+ ndl=6, # num decoder layers
+ d_ffn=1024, # dim of feedforward
+ dropout=0.0,
+ act=nn.ReLU(),
+ eval_idx=-1,
+ # Training args
+ nd=100, # num denoising
+ label_noise_ratio=0.5,
+ box_noise_scale=1.0,
+ learnt_init_query=False,
+ ):
  """
  Initializes the RTDETRDecoder module with the given parameters.

@@ -300,28 +306,30 @@ class RTDETRDecoder(nn.Module):
  feats, shapes = self._get_encoder_input(x)

  # Prepare denoising training
- dn_embed, dn_bbox, attn_mask, dn_meta = \
- get_cdn_group(batch,
- self.nc,
- self.num_queries,
- self.denoising_class_embed.weight,
- self.num_denoising,
- self.label_noise_ratio,
- self.box_noise_scale,
- self.training)
-
- embed, refer_bbox, enc_bboxes, enc_scores = \
- self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)
+ dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
+ batch,
+ self.nc,
+ self.num_queries,
+ self.denoising_class_embed.weight,
+ self.num_denoising,
+ self.label_noise_ratio,
+ self.box_noise_scale,
+ self.training,
+ )
+
+ embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)

  # Decoder
- dec_bboxes, dec_scores = self.decoder(embed,
- refer_bbox,
- feats,
- shapes,
- self.dec_bbox_head,
- self.dec_score_head,
- self.query_pos_head,
- attn_mask=attn_mask)
+ dec_bboxes, dec_scores = self.decoder(
+ embed,
+ refer_bbox,
+ feats,
+ shapes,
+ self.dec_bbox_head,
+ self.dec_score_head,
+ self.query_pos_head,
+ attn_mask=attn_mask,
+ )
  x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
  if self.training:
  return x
@@ -329,24 +337,24 @@ class RTDETRDecoder(nn.Module):
  y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
  return y if self.export else (y, x)

- def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
+ def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
  """Generates anchor bounding boxes for given shapes with specific grid size and validates them."""
  anchors = []
  for i, (h, w) in enumerate(shapes):
  sy = torch.arange(end=h, dtype=dtype, device=device)
  sx = torch.arange(end=w, dtype=dtype, device=device)
- grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
+ grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
  grid_xy = torch.stack([grid_x, grid_y], -1) # (h, w, 2)

  valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
  grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH # (1, h, w, 2)
- wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0 ** i)
+ wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
  anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4)) # (1, h*w, 4)

  anchors = torch.cat(anchors, 1) # (1, h*w*nl, 4)
  valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) # 1, h*w*nl, 1
  anchors = torch.log(anchors / (1 - anchors))
- anchors = anchors.masked_fill(~valid_mask, float('inf'))
+ anchors = anchors.masked_fill(~valid_mask, float("inf"))
  return anchors, valid_mask

  def _get_encoder_input(self, x):
@@ -413,13 +421,13 @@ class RTDETRDecoder(nn.Module):
  # NOTE: the weight initialization in `linear_init_` would cause NaN when training with custom datasets.
  # linear_init_(self.enc_score_head)
  constant_(self.enc_score_head.bias, bias_cls)
- constant_(self.enc_bbox_head.layers[-1].weight, 0.)
- constant_(self.enc_bbox_head.layers[-1].bias, 0.)
+ constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
+ constant_(self.enc_bbox_head.layers[-1].bias, 0.0)
  for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
  # linear_init_(cls_)
  constant_(cls_.bias, bias_cls)
- constant_(reg_.layers[-1].weight, 0.)
- constant_(reg_.layers[-1].bias, 0.)
+ constant_(reg_.layers[-1].weight, 0.0)
+ constant_(reg_.layers[-1].bias, 0.0)

  linear_init_(self.enc_output[0])
  xavier_uniform_(self.enc_output[0].weight)
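The bias_cls constant used above is produced by bias_init_with_prob, which sets a classification bias so the initial sigmoid score roughly equals a chosen prior probability. A hedged numeric check (the import path is an assumption):

import math
from ultralytics.nn.modules.utils import bias_init_with_prob  # assumed import path

b = bias_init_with_prob(0.01)              # -log((1 - p) / p)
print(round(b, 4))                         # -4.5951
print(round(1 / (1 + math.exp(-b)), 4))    # 0.01 -> initial sigmoid score matches the prior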
ultralytics/nn/modules/transformer.py

@@ -11,8 +11,18 @@ from torch.nn.init import constant_, xavier_uniform_
  from .conv import Conv
  from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch

- __all__ = ('TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI',
- 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP')
+ __all__ = (
+ "TransformerEncoderLayer",
+ "TransformerLayer",
+ "TransformerBlock",
+ "MLPBlock",
+ "LayerNorm2d",
+ "AIFI",
+ "DeformableTransformerDecoder",
+ "DeformableTransformerDecoderLayer",
+ "MSDeformAttn",
+ "MLP",
+ )


  class TransformerEncoderLayer(nn.Module):
@@ -22,9 +32,11 @@ class TransformerEncoderLayer(nn.Module):
  """Initialize the TransformerEncoderLayer with specified parameters."""
  super().__init__()
  from ...utils.torch_utils import TORCH_1_9
+
  if not TORCH_1_9:
  raise ModuleNotFoundError(
- 'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
+ "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
+ )
  self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
  # Implementation of Feedforward model
  self.fc1 = nn.Linear(c1, cm)
@@ -91,12 +103,11 @@ class AIFI(TransformerEncoderLayer):
  """Builds 2D sine-cosine position embedding."""
  grid_w = torch.arange(int(w), dtype=torch.float32)
  grid_h = torch.arange(int(h), dtype=torch.float32)
- grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
- assert embed_dim % 4 == 0, \
- 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+ grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+ assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
  pos_dim = embed_dim // 4
  omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
- omega = 1. / (temperature ** omega)
+ omega = 1.0 / (temperature**omega)

  out_w = grid_w.flatten()[..., None] @ omega[None]
  out_h = grid_h.flatten()[..., None] @ omega[None]
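The lines above are the core of AIFI's 2D sine-cosine position embedding. A standalone hedged sketch of the same construction (the function name and defaults below are illustrative, not the library's API):

import torch

def sincos_pos_embed_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0):
    """Build a (1, w*h, embed_dim) 2D sine-cosine position embedding."""
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    assert embed_dim % 4 == 0, "embed_dim must be divisible by 4"
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)                   # (pos_dim,) frequency bands
    out_w = grid_w.flatten()[..., None] @ omega[None]    # (w*h, pos_dim)
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return torch.cat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None]

print(sincos_pos_embed_2d(20, 20).shape)  # torch.Size([1, 400, 256])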
@@ -213,10 +224,10 @@ class MSDeformAttn(nn.Module):
  """Initialize MSDeformAttn with the given parameters."""
  super().__init__()
  if d_model % n_heads != 0:
- raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}')
+ raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
  _d_per_head = d_model // n_heads
  # Better to set _d_per_head to a power of 2 which is more efficient in a CUDA implementation
- assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`'
+ assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"

  self.im2col_step = 64

@@ -234,21 +245,24 @@ class MSDeformAttn(nn.Module):

  def _reset_parameters(self):
  """Reset module parameters."""
- constant_(self.sampling_offsets.weight.data, 0.)
+ constant_(self.sampling_offsets.weight.data, 0.0)
  thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
  grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
- grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(
- 1, self.n_levels, self.n_points, 1)
+ grid_init = (
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+ .view(self.n_heads, 1, 1, 2)
+ .repeat(1, self.n_levels, self.n_points, 1)
+ )
  for i in range(self.n_points):
  grid_init[:, :, i, :] *= i + 1
  with torch.no_grad():
  self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
- constant_(self.attention_weights.weight.data, 0.)
- constant_(self.attention_weights.bias.data, 0.)
+ constant_(self.attention_weights.weight.data, 0.0)
+ constant_(self.attention_weights.bias.data, 0.0)
  xavier_uniform_(self.value_proj.weight.data)
- constant_(self.value_proj.bias.data, 0.)
+ constant_(self.value_proj.bias.data, 0.0)
  xavier_uniform_(self.output_proj.weight.data)
- constant_(self.output_proj.bias.data, 0.)
+ constant_(self.output_proj.bias.data, 0.0)

  def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
  """
@@ -288,7 +302,7 @@ class MSDeformAttn(nn.Module):
  add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
  sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
  else:
- raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {num_points}.')
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
  output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
  return self.output_proj(output)

@@ -301,7 +315,7 @@ class DeformableTransformerDecoderLayer(nn.Module):
  https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
  """

- def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4):
+ def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
  """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
  super().__init__()

@@ -339,14 +353,16 @@ class DeformableTransformerDecoderLayer(nn.Module):

  # Self attention
  q = k = self.with_pos_embed(embed, query_pos)
- tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1),
- attn_mask=attn_mask)[0].transpose(0, 1)
+ tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
+ 0
+ ].transpose(0, 1)
  embed = embed + self.dropout1(tgt)
  embed = self.norm1(embed)

  # Cross attention
- tgt = self.cross_attn(self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes,
- padding_mask)
+ tgt = self.cross_attn(
+ self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
+ )
  embed = embed + self.dropout2(tgt)
  embed = self.norm2(embed)

@@ -370,16 +386,17 @@ class DeformableTransformerDecoder(nn.Module):
  self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

  def forward(
- self,
- embed, # decoder embeddings
- refer_bbox, # anchor
- feats, # image features
- shapes, # feature shapes
- bbox_head,
- score_head,
- pos_mlp,
- attn_mask=None,
- padding_mask=None):
+ self,
+ embed, # decoder embeddings
+ refer_bbox, # anchor
+ feats, # image features
+ shapes, # feature shapes
+ bbox_head,
+ score_head,
+ pos_mlp,
+ attn_mask=None,
+ padding_mask=None,
+ ):
  """Perform the forward pass through the entire decoder."""
  output = embed
  dec_bboxes = []
ultralytics/nn/modules/utils.py

@@ -10,7 +10,7 @@ import torch.nn as nn
  import torch.nn.functional as F
  from torch.nn.init import uniform_

- __all__ = 'multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'
+ __all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"


  def _get_clones(module, n):
@@ -27,7 +27,7 @@ def linear_init_(module):
  """Initialize the weights and biases of a linear module."""
  bound = 1 / math.sqrt(module.weight.shape[0])
  uniform_(module.weight, -bound, bound)
- if hasattr(module, 'bias') and module.bias is not None:
+ if hasattr(module, "bias") and module.bias is not None:
  uniform_(module.bias, -bound, bound)


@@ -39,9 +39,12 @@ def inverse_sigmoid(x, eps=1e-5):
  return torch.log(x1 / x2)


- def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor,
- sampling_locations: torch.Tensor,
- attention_weights: torch.Tensor) -> torch.Tensor:
+ def multi_scale_deformable_attn_pytorch(
+ value: torch.Tensor,
+ value_spatial_shapes: torch.Tensor,
+ sampling_locations: torch.Tensor,
+ attention_weights: torch.Tensor,
+ ) -> torch.Tensor:
  """
  Multi-scale deformable attention.

@@ -58,23 +61,25 @@ def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shape
  # bs, H_*W_, num_heads*embed_dims ->
  # bs, num_heads*embed_dims, H_*W_ ->
  # bs*num_heads, embed_dims, H_, W_
- value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_))
+ value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
  # bs, num_queries, num_heads, num_points, 2 ->
  # bs, num_heads, num_queries, num_points, 2 ->
  # bs*num_heads, num_queries, num_points, 2
  sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
  # bs*num_heads, embed_dims, num_queries, num_points
- sampling_value_l_ = F.grid_sample(value_l_,
- sampling_grid_l_,
- mode='bilinear',
- padding_mode='zeros',
- align_corners=False)
+ sampling_value_l_ = F.grid_sample(
+ value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+ )
  sampling_value_list.append(sampling_value_l_)
  # (bs, num_queries, num_heads, num_levels, num_points) ->
  # (bs, num_heads, num_queries, num_levels, num_points) ->
  # (bs, num_heads, 1, num_queries, num_levels*num_points)
- attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries,
- num_levels * num_points)
- output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(
- bs, num_heads * embed_dims, num_queries))
+ attention_weights = attention_weights.transpose(1, 2).reshape(
+ bs * num_heads, 1, num_queries, num_levels * num_points
+ )
+ output = (
+ (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+ .sum(-1)
+ .view(bs, num_heads * embed_dims, num_queries)
+ )
  return output.transpose(1, 2).contiguous()
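To make the reformatted function's tensor contract concrete, a hedged shape check (sizes are illustrative assumptions; the import path is assumed to be ultralytics.nn.modules.utils, and the spatial shapes are passed as a plain list of (H, W) pairs, which the function only iterates over):

import torch
from ultralytics.nn.modules.utils import multi_scale_deformable_attn_pytorch  # assumed import path

bs, heads, dims, queries, levels, points = 2, 8, 32, 100, 2, 4
shapes = [(8, 8), (4, 4)]                                    # per-level (H, W)
num_value = sum(h * w for h, w in shapes)                    # 80 flattened tokens across both levels
value = torch.rand(bs, num_value, heads, dims)
sampling_locations = torch.rand(bs, queries, heads, levels, points, 2)    # normalized to [0, 1]
attention_weights = torch.rand(bs, queries, heads, levels, points).softmax(-1)  # any normalization works for a shape check
out = multi_scale_deformable_attn_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)  # torch.Size([2, 100, 256]) -> (bs, num_queries, heads * dims)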