ultralytics 8.3.98__py3-none-any.whl → 8.3.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_python.py +56 -0
- ultralytics/__init__.py +3 -2
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +48 -0
- ultralytics/cfg/models/11/yoloe-11.yaml +48 -0
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +45 -0
- ultralytics/cfg/models/v8/yoloe-v8.yaml +45 -0
- ultralytics/data/augment.py +101 -5
- ultralytics/data/dataset.py +165 -12
- ultralytics/engine/exporter.py +5 -4
- ultralytics/engine/trainer.py +16 -7
- ultralytics/models/__init__.py +2 -2
- ultralytics/models/yolo/__init__.py +3 -3
- ultralytics/models/yolo/detect/val.py +6 -1
- ultralytics/models/yolo/model.py +183 -3
- ultralytics/models/yolo/segment/val.py +43 -16
- ultralytics/models/yolo/yoloe/__init__.py +21 -0
- ultralytics/models/yolo/yoloe/predict.py +170 -0
- ultralytics/models/yolo/yoloe/train.py +355 -0
- ultralytics/models/yolo/yoloe/train_seg.py +141 -0
- ultralytics/models/yolo/yoloe/val.py +187 -0
- ultralytics/nn/autobackend.py +17 -7
- ultralytics/nn/modules/__init__.py +18 -1
- ultralytics/nn/modules/block.py +17 -1
- ultralytics/nn/modules/head.py +359 -22
- ultralytics/nn/tasks.py +276 -10
- ultralytics/nn/text_model.py +193 -0
- ultralytics/utils/benchmarks.py +1 -0
- ultralytics/utils/callbacks/comet.py +3 -6
- ultralytics/utils/downloads.py +6 -2
- ultralytics/utils/loss.py +67 -6
- ultralytics/utils/plotting.py +1 -1
- ultralytics/utils/tal.py +1 -1
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/METADATA +10 -10
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/RECORD +38 -28
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/top_level.txt +0 -0
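The new `ultralytics/models/yolo/yoloe/*` modules listed above, together with the `YOLOEDetect`/`YOLOESegment` heads added to `head.py` below, introduce open-vocabulary (promptable) detection and segmentation. A minimal usage sketch, assuming the `YOLOE` wrapper class added in this release is exported from the top-level package and that a `yoloe-11s-seg.pt` checkpoint exists — neither of which is confirmed by this diff alone:

```python
# Hedged sketch: the checkpoint name and helper methods are illustrative, not verified from this diff.
from ultralytics import YOLOE

model = YOLOE("yoloe-11s-seg.pt")  # hypothetical pretrained YOLOE segmentation checkpoint

# Text prompts: build text embeddings for the desired vocabulary and set them as the active classes.
names = ["person", "bus"]
model.set_classes(names, model.get_text_pe(names))

results = model.predict("bus.jpg")  # standard Ultralytics predict API
results[0].show()
```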
ultralytics/nn/modules/head.py
CHANGED
@@ -6,16 +6,18 @@ import math
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch.nn.init import constant_, xavier_uniform_
 
 from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
+from ultralytics.utils.torch_utils import fuse_conv_and_bn, smart_inference_mode
 
 from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
 from .conv import Conv, DWConv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
 
-__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder", "v10Detect"
+__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder", "v10Detect", "YOLOEDetect", "YOLOESegment"
 
 
 class Detect(nn.Module):
@@ -78,11 +80,12 @@ class Detect(nn.Module):
         Performs forward pass of the v10Detect module.
 
         Args:
-            x (
+            x (List[torch.Tensor]): Input feature maps from different levels.
 
         Returns:
-            (dict
-
+            (dict | tuple): If in training mode, returns a dictionary containing the outputs of both one2many and
+                one2one detections. If not in training mode, returns processed detections or a tuple with
+                processed detections and raw outputs.
         """
         x_detach = [xi.detach() for xi in x]
         one2one = [
@@ -98,7 +101,15 @@ class Detect(nn.Module):
         return y if self.export else (y, {"one2many": x, "one2one": one2one})
 
     def _inference(self, x):
-        """
+        """
+        Decode predicted bounding boxes and class probabilities based on multiple-level feature maps.
+
+        Args:
+            x (List[torch.Tensor]): List of feature maps from different detection layers.
+
+        Returns:
+            (torch.Tensor): Concatenated tensor of decoded bounding boxes and class probabilities.
+        """
         # Inference path
         shape = x[0].shape  # BCHW
         x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
@@ -320,19 +331,216 @@ class WorldDetect(Detect):
             x[i] = torch.cat((self.cv2[i](x[i]), self.cv4[i](self.cv3[i](x[i]), text)), 1)
         if self.training:
             return x
+        self.no = self.nc + self.reg_max * 4  # self.nc could be changed when inference with different texts
+        y = self._inference(x)
+        return y if self.export else (y, x)
 
-
-
-
-
-
-
+    def bias_init(self):
+        """Initialize Detect() biases, WARNING: requires stride availability."""
+        m = self  # self.model[-1]  # Detect() module
+        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
+        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
+        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
+            a[-1].bias.data[:] = 1.0  # box
+            # b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
 
-
-
-
+
+class SAVPE(nn.Module):
+    """Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
+
+    def __init__(self, ch, c3, embed):
+        """Initialize SAVPE module with channels, intermediate channels, and embedding dimension."""
+        super().__init__()
+        self.cv1 = nn.ModuleList(
+            nn.Sequential(
+                Conv(x, c3, 3), Conv(c3, c3, 3), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity()
+            )
+            for i, x in enumerate(ch)
+        )
+
+        self.cv2 = nn.ModuleList(
+            nn.Sequential(Conv(x, c3, 1), nn.Upsample(scale_factor=i * 2) if i in {1, 2} else nn.Identity())
+            for i, x in enumerate(ch)
+        )
+
+        self.c = 16
+        self.cv3 = nn.Conv2d(3 * c3, embed, 1)
+        self.cv4 = nn.Conv2d(3 * c3, self.c, 3, padding=1)
+        self.cv5 = nn.Conv2d(1, self.c, 3, padding=1)
+        self.cv6 = nn.Sequential(Conv(2 * self.c, self.c, 3), nn.Conv2d(self.c, self.c, 3, padding=1))
+
+    def forward(self, x, vp):
+        """Process input features and visual prompts to generate enhanced embeddings."""
+        y = [self.cv2[i](xi) for i, xi in enumerate(x)]
+        y = self.cv4(torch.cat(y, dim=1))
+
+        x = [self.cv1[i](xi) for i, xi in enumerate(x)]
+        x = self.cv3(torch.cat(x, dim=1))
+
+        B, C, H, W = x.shape
+
+        Q = vp.shape[1]
+
+        x = x.view(B, C, -1)
+
+        y = y.reshape(B, 1, self.c, H, W).expand(-1, Q, -1, -1, -1).reshape(B * Q, self.c, H, W)
+        vp = vp.reshape(B, Q, 1, H, W).reshape(B * Q, 1, H, W)
+
+        y = self.cv6(torch.cat((y, self.cv5(vp)), dim=1))
+
+        y = y.reshape(B, Q, self.c, -1)
+        vp = vp.reshape(B, Q, 1, -1)
+
+        score = y * vp + torch.logical_not(vp) * torch.finfo(y.dtype).min
+
+        score = F.softmax(score, dim=-1, dtype=torch.float).to(score.dtype)
+
+        aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
+
+        return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
+
+
+class LRPCHead(nn.Module):
+    """Lightweight Region Proposal and Classification Head for efficient object detection."""
+
+    def __init__(self, vocab, pf, loc, enabled=True):
+        """Initialize LRPCHead with vocabulary, proposal filter, and localization components."""
+        super().__init__()
+        self.vocab = self.conv2linear(vocab) if enabled else vocab
+        self.pf = pf
+        self.loc = loc
+        self.enabled = enabled
+
+    def conv2linear(self, conv):
+        """Convert a 1x1 convolutional layer to a linear layer."""
+        assert isinstance(conv, nn.Conv2d) and conv.kernel_size == (1, 1)
+        linear = nn.Linear(conv.in_channels, conv.out_channels)
+        linear.weight.data = conv.weight.view(conv.out_channels, -1).data
+        linear.bias.data = conv.bias.data
+        return linear
+
+    def forward(self, cls_feat, loc_feat, conf, max_det):
+        """Process classification and localization features to generate detection proposals."""
+        if self.enabled:
+            pf_score = self.pf(cls_feat)[0, 0].flatten(0)
+            mask = pf_score.sigmoid() > conf
+
+            cls_feat = self.vocab(cls_feat.flatten(2).transpose(-1, -2)[:, mask])
+            return (self.loc(loc_feat), cls_feat.transpose(-1, -2)), mask
         else:
-
+            cls_feat = self.vocab(cls_feat)
+            loc_feat = self.loc(loc_feat)
+            return (loc_feat, cls_feat.flatten(2)), torch.ones(
+                cls_feat.shape[2] * cls_feat.shape[3], device=cls_feat.device, dtype=torch.bool
+            )
+
+
+class YOLOEDetect(Detect):
+    """Head for integrating YOLO detection models with semantic understanding from text embeddings."""
+
+    is_fused = False
+
+    def __init__(self, nc=80, embed=512, with_bn=False, ch=()):
+        """Initialize YOLO detection layer with nc classes and layer channels ch."""
+        super().__init__(nc, ch)
+        c3 = max(ch[0], min(self.nc, 100))
+        assert c3 <= embed
+        assert with_bn is True
+        self.cv3 = (
+            nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, embed, 1)) for x in ch)
+            if self.legacy
+            else nn.ModuleList(
+                nn.Sequential(
+                    nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
+                    nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
+                    nn.Conv2d(c3, embed, 1),
+                )
+                for x in ch
+            )
+        )
+
+        self.cv4 = nn.ModuleList(BNContrastiveHead(embed) if with_bn else ContrastiveHead() for _ in ch)
+
+        self.reprta = Residual(SwiGLUFFN(embed, embed))
+        self.savpe = SAVPE(ch, c3, embed)
+        self.embed = embed
+
+    @smart_inference_mode()
+    def fuse(self, txt_feats):
+        """Fuse text features with model weights for efficient inference."""
+        if self.is_fused:
+            return
+
+        assert not self.training
+        txt_feats = txt_feats.to(torch.float32).squeeze(0)
+        for cls_head, bn_head in zip(self.cv3, self.cv4):
+            assert isinstance(cls_head, nn.Sequential)
+            assert isinstance(bn_head, BNContrastiveHead)
+            conv = cls_head[-1]
+            assert isinstance(conv, nn.Conv2d)
+            logit_scale = bn_head.logit_scale
+            bias = bn_head.bias
+            norm = bn_head.norm
+
+            t = txt_feats * logit_scale.exp()
+            conv: nn.Conv2d = fuse_conv_and_bn(conv, norm)
+
+            w = conv.weight.data.squeeze(-1).squeeze(-1)
+            b = conv.bias.data
+
+            w = t @ w
+            b1 = (t @ b.reshape(-1).unsqueeze(-1)).squeeze(-1)
+            b2 = torch.ones_like(b1) * bias
+
+            conv = (
+                nn.Conv2d(
+                    conv.in_channels,
+                    w.shape[0],
+                    kernel_size=1,
+                )
+                .requires_grad_(False)
+                .to(conv.weight.device)
+            )
+
+            conv.weight.data.copy_(w.unsqueeze(-1).unsqueeze(-1))
+            conv.bias.data.copy_(b1 + b2)
+            cls_head[-1] = conv
+
+            bn_head.fuse()
+
+        del self.reprta
+        self.reprta = nn.Identity()
+        self.is_fused = True
+
+    def get_tpe(self, tpe):
+        """Get text prompt embeddings with normalization."""
+        return None if tpe is None else F.normalize(self.reprta(tpe), dim=-1, p=2)
+
+    def get_vpe(self, x, vpe):
+        """Get visual prompt embeddings with spatial awareness."""
+        if vpe.shape[1] == 0:  # no visual prompt embeddings
+            return torch.zeros(x[0].shape[0], 0, self.embed, device=x[0].device)
+        if vpe.ndim == 4:  # (B, N, H, W)
+            vpe = self.savpe(x, vpe)
+        assert vpe.ndim == 3  # (B, N, D)
+        return vpe
+
+    def forward_lrpc(self, x, return_mask=False):
+        """Process features with fused text embeddings to generate detections for prompt-free model."""
+        masks = []
+        assert self.is_fused, "Prompt-free inference requires model to be fused!"
+        for i in range(self.nl):
+            cls_feat = self.cv3[i](x[i])
+            loc_feat = self.cv2[i](x[i])
+            assert isinstance(self.lrpc[i], LRPCHead)
+            x[i], mask = self.lrpc[i](cls_feat, loc_feat, self.conf, self.max_det)
+            masks.append(mask)
+        shape = x[0][0].shape
+        if self.dynamic or self.shape != shape:
+            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors([b[0] for b in x], self.stride, 0.5))
+            self.shape = shape
+        box = torch.cat([xi[0].view(shape[0], self.reg_max * 4, -1) for xi in x], 2)
+        cls = torch.cat([xi[1] for xi in x], 2)
 
         if self.export and self.format in {"tflite", "edgetpu"}:
             # Precompute normalization factor to increase numerical stability
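Note on the hunk above: `LRPCHead.conv2linear` relies on a 1x1 convolution being a per-position linear map over channels, so its weights can be copied directly into an `nn.Linear` applied to flattened spatial positions. A standalone sanity check of that equivalence (plain PyTorch, not part of the package):

```python
import torch
import torch.nn as nn

def conv2linear(conv: nn.Conv2d) -> nn.Linear:
    """Copy the weights of a 1x1 Conv2d into an equivalent nn.Linear (same trick as LRPCHead)."""
    assert conv.kernel_size == (1, 1)
    linear = nn.Linear(conv.in_channels, conv.out_channels)
    linear.weight.data = conv.weight.view(conv.out_channels, -1).data
    linear.bias.data = conv.bias.data
    return linear

conv = nn.Conv2d(64, 128, kernel_size=1)
linear = conv2linear(conv)

x = torch.randn(2, 64, 8, 8)
y_conv = conv(x).flatten(2).transpose(-1, -2)      # (B, H*W, 128) via convolution
y_linear = linear(x.flatten(2).transpose(-1, -2))  # (B, H*W, 128) via linear layer
assert torch.allclose(y_conv, y_linear, atol=1e-5)
```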
@@ -345,17 +553,105 @@ class WorldDetect(Detect):
         else:
             dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
 
-
+        mask = torch.cat(masks)
+        y = torch.cat((dbox[:, :, mask], cls.sigmoid()), 1)
+
+        if return_mask:
+            return (y, mask) if self.export else ((y, x), mask)
+        else:
+            return y if self.export else (y, x)
+
+    def forward(self, x, cls_pe, return_mask=False):
+        """Process features with class prompt embeddings to generate detections."""
+        if hasattr(self, "lrpc"):  # for prompt-free inference
+            return self.forward_lrpc(x, return_mask)
+        for i in range(self.nl):
+            x[i] = torch.cat((self.cv2[i](x[i]), self.cv4[i](self.cv3[i](x[i]), cls_pe)), 1)
+        if self.training:
+            return x
+        self.no = self.nc + self.reg_max * 4  # self.nc could be changed when inference with different texts
+        y = self._inference(x)
         return y if self.export else (y, x)
 
     def bias_init(self):
-        """Initialize
+        """Initialize biases for detection heads."""
         m = self  # self.model[-1]  # Detect() module
         # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
         # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
-        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
+        for a, b, c, s in zip(m.cv2, m.cv3, m.cv4, m.stride):  # from
             a[-1].bias.data[:] = 1.0  # box
             # b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
+            b[-1].bias.data[:] = 0.0
+            c.bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)
+
+
+class SwiGLUFFN(nn.Module):
+    """SwiGLU Feed-Forward Network for transformer-based architectures."""
+
+    def __init__(self, gc, ec, e=4) -> None:
+        """Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor."""
+        super().__init__()
+        self.w12 = nn.Linear(gc, e * ec)
+        self.w3 = nn.Linear(e * ec // 2, ec)
+
+    def forward(self, x):
+        """Apply SwiGLU transformation to input features."""
+        x12 = self.w12(x)
+        x1, x2 = x12.chunk(2, dim=-1)
+        hidden = F.silu(x1) * x2
+        return self.w3(hidden)
+
+
+class Residual(nn.Module):
+    """Residual connection wrapper for neural network modules."""
+
+    def __init__(self, m) -> None:
+        """Initialize residual module with the wrapped module."""
+        super().__init__()
+        self.m = m
+        nn.init.zeros_(self.m.w3.bias)
+        # For models with l scale, please change the initialization to
+        # nn.init.constant_(self.m.w3.weight, 1e-6)
+        nn.init.zeros_(self.m.w3.weight)
+
+    def forward(self, x):
+        """Apply residual connection to input features."""
+        return x + self.m(x)
+
+
+class YOLOESegment(YOLOEDetect):
+    """YOLO segmentation head with text embedding capabilities."""
+
+    def __init__(self, nc=80, nm=32, npr=256, embed=512, with_bn=False, ch=()):
+        """Initialize YOLOESegment with class count, mask parameters, and embedding dimensions."""
+        super().__init__(nc, embed, with_bn, ch)
+        self.nm = nm
+        self.npr = npr
+        self.proto = Proto(ch[0], self.npr, self.nm)
+
+        c5 = max(ch[0] // 4, self.nm)
+        self.cv5 = nn.ModuleList(nn.Sequential(Conv(x, c5, 3), Conv(c5, c5, 3), nn.Conv2d(c5, self.nm, 1)) for x in ch)
+
+    def forward(self, x, text):
+        """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients."""
+        p = self.proto(x[0])  # mask protos
+        bs = p.shape[0]  # batch size
+
+        mc = torch.cat([self.cv5[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2)  # mask coefficients
+        has_lrpc = hasattr(self, "lrpc")
+
+        if not has_lrpc:
+            x = YOLOEDetect.forward(self, x, text)
+        else:
+            x, mask = YOLOEDetect.forward(self, x, text, return_mask=True)
+
+        if self.training:
+            return x, mc, p
+
+        if has_lrpc:
+            mc = mc[:, :, mask]
+
+        return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
 
 
 class RTDETRDecoder(nn.Module):
@@ -449,7 +745,17 @@ class RTDETRDecoder(nn.Module):
         self._reset_parameters()
 
     def forward(self, x, batch=None):
-        """
+        """
+        Runs the forward pass of the module, returning bounding box and classification scores for the input.
+
+        Args:
+            x (List[torch.Tensor]): List of feature maps from the backbone.
+            batch (dict, optional): Batch information for training.
+
+        Returns:
+            (tuple | torch.Tensor): During training, returns a tuple of bounding boxes, scores, and other metadata.
+                During inference, returns a tensor of shape (bs, 300, 4+nc) containing bounding boxes and class scores.
+        """
         from ultralytics.models.utils.ops import get_cdn_group
 
         # Input projection and embedding
@@ -488,7 +794,19 @@ class RTDETRDecoder(nn.Module):
         return y if self.export else (y, x)
 
     def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
-        """
+        """
+        Generates anchor bounding boxes for given shapes with specific grid size and validates them.
+
+        Args:
+            shapes (list): List of feature map shapes.
+            grid_size (float, optional): Base size of grid cells. Default is 0.05.
+            dtype (torch.dtype, optional): Data type for tensors. Default is torch.float32.
+            device (str, optional): Device to create tensors on. Default is "cpu".
+            eps (float, optional): Small value for numerical stability. Default is 1e-2.
+
+        Returns:
+            (tuple): Tuple containing anchors and valid mask tensors.
+        """
         anchors = []
         for i, (h, w) in enumerate(shapes):
             sy = torch.arange(end=h, dtype=dtype, device=device)
@@ -508,7 +826,15 @@ class RTDETRDecoder(nn.Module):
         return anchors, valid_mask
 
     def _get_encoder_input(self, x):
-        """
+        """
+        Processes and returns encoder inputs by getting projection features from input and concatenating them.
+
+        Args:
+            x (List[torch.Tensor]): List of feature maps from the backbone.
+
+        Returns:
+            (tuple): Tuple containing processed features and their shapes.
+        """
         # Get projection features
         x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
         # Get encoder inputs
@@ -526,7 +852,18 @@ class RTDETRDecoder(nn.Module):
         return feats, shapes
 
     def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
-        """
+        """
+        Generates and prepares the input required for the decoder from the provided features and shapes.
+
+        Args:
+            feats (torch.Tensor): Processed features from encoder.
+            shapes (list): List of feature map shapes.
+            dn_embed (torch.Tensor, optional): Denoising embeddings. Default is None.
+            dn_bbox (torch.Tensor, optional): Denoising bounding boxes. Default is None.
+
+        Returns:
+            (tuple): Tuple containing embeddings, reference bounding boxes, encoded bounding boxes, and scores.
+        """
         bs = feats.shape[0]
         # Prepare input for decoder
         anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
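Taken together, the new `SwiGLUFFN` and `Residual` modules give `YOLOEDetect` a zero-initialized refinement branch (`reprta`) for text prompt embeddings: at initialization the branch contributes nothing, so embeddings pass through unchanged, and the head learns a correction during training. A minimal standalone sketch of that behavior, reassembled from the classes in the diff above (illustrative only, not imported from the package):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward: gate half of the expanded features with SiLU, then project back."""
    def __init__(self, gc, ec, e=4):
        super().__init__()
        self.w12 = nn.Linear(gc, e * ec)      # joint projection for value and gate halves
        self.w3 = nn.Linear(e * ec // 2, ec)  # projection back to the embedding dimension

    def forward(self, x):
        x1, x2 = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(x1) * x2)

class Residual(nn.Module):
    """Residual wrapper; zero-initializing w3 makes the block start out as an identity."""
    def __init__(self, m):
        super().__init__()
        self.m = m
        nn.init.zeros_(self.m.w3.weight)
        nn.init.zeros_(self.m.w3.bias)

    def forward(self, x):
        return x + self.m(x)

reprta = Residual(SwiGLUFFN(512, 512))
tpe = F.normalize(torch.randn(1, 80, 512), dim=-1)  # e.g. text embeddings for 80 class prompts
out = F.normalize(reprta(tpe), dim=-1, p=2)         # mirrors YOLOEDetect.get_tpe before training
assert torch.allclose(out, tpe, atol=1e-6)          # identity at init thanks to the zero-init
```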