ultralytics 8.2.103__py3-none-any.whl → 8.3.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
- tests/__init__.py +2 -2
- tests/conftest.py +1 -1
- tests/test_cuda.py +1 -1
- tests/test_engine.py +5 -5
- tests/test_explorer.py +3 -3
- tests/test_exports.py +1 -2
- tests/test_integrations.py +9 -9
- tests/test_python.py +11 -11
- tests/test_solutions.py +3 -3
- ultralytics/__init__.py +1 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +2 -2
- ultralytics/cfg/default.yaml +1 -0
- ultralytics/cfg/models/11/yolo11-cls.yaml +30 -0
- ultralytics/cfg/models/11/yolo11-obb.yaml +47 -0
- ultralytics/cfg/models/11/yolo11-pose.yaml +48 -0
- ultralytics/cfg/models/11/yolo11-seg.yaml +47 -0
- ultralytics/cfg/models/11/yolo11.yaml +47 -0
- ultralytics/data/augment.py +101 -80
- ultralytics/engine/trainer.py +8 -1
- ultralytics/nn/modules/__init__.py +7 -1
- ultralytics/nn/modules/block.py +198 -37
- ultralytics/nn/modules/conv.py +2 -1
- ultralytics/nn/modules/head.py +9 -2
- ultralytics/nn/tasks.py +25 -2
- ultralytics/utils/downloads.py +3 -2
- ultralytics/utils/loss.py +3 -0
- ultralytics/utils/torch_utils.py +1 -1
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/METADATA +51 -65
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/RECORD +33 -28
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/LICENSE +0 -0
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/WHEEL +0 -0
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/top_level.txt +0 -0
ultralytics/data/augment.py
CHANGED
@@ -1628,92 +1628,105 @@ class LetterBox:
         return labels
 
 
-class CopyPaste:
+class CopyPaste(BaseMixTransform):
     """
-    …
+    CopyPaste class for applying Copy-Paste augmentation to image datasets.
 
+    This class implements the Copy-Paste augmentation technique as described in the paper "Simple Copy-Paste is a Strong
+    Data Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It combines objects from
+    different images to create new training samples.
 
     Attributes:
-        …
+        dataset (Any): The dataset to which Copy-Paste augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before Copy-Paste.
+        p (float): Probability of applying Copy-Paste augmentation.
 
     Methods:
-        …
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies Copy-Paste augmentation to the input labels.
+        __call__: Applies the Copy-Paste transformation to images and annotations.
 
     Examples:
-        >>> …
-        >>> …
-        >>> …
+        >>> from ultralytics.data.augment import CopyPaste
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> copypaste = CopyPaste(dataset, p=0.5)
+        >>> augmented_labels = copypaste(original_labels)
     """
 
-    def __init__(self, p=0.5) -> None:
-        """
-        …
+    def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
+        """Initializes CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+        assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
+        self.mode = mode
 
-        …
+    def get_indexes(self):
+        """Returns a list of random indexes from the dataset for CopyPaste augmentation."""
+        return random.randint(0, len(self.dataset) - 1)
 
-        …
+    def _mix_transform(self, labels):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        labels2 = labels["mix_labels"][0]
+        return self._transform(labels, labels2)
 
-        …
+    def __call__(self, labels):
+        """Applies Copy-Paste augmentation to an image and its labels."""
+        if len(labels["instances"].segments) == 0 or self.p == 0:
+            return labels
+        if self.mode == "flip":
+            return self._transform(labels)
 
-        …
-        self.p = p
+        # Get index of one or three other images
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
 
-        …
-        Applies Copy-Paste augmentation to an image and its instances.
+        # Get images information will be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
 
-        …
-            - 'instances' (ultralytics.engine.results.Instances): Object containing bounding boxes, segments, etc.
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
 
-        …
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
 
-        …
-        """
-        im = labels["img"]
-        cls = labels["cls"]
+    def _transform(self, labels1, labels2={}):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        im = labels1["img"]
+        cls = labels1["cls"]
         h, w = im.shape[:2]
-        instances = …
+        instances = labels1.pop("instances")
         instances.convert_bbox(format="xyxy")
         instances.denormalize(w, h)
-        …
-        return …
+
+        im_new = np.zeros(im.shape, np.uint8)
+        instances2 = labels2.pop("instances", None)
+        if instances2 is None:
+            instances2 = deepcopy(instances)
+            instances2.fliplr(w)
+        ioa = bbox_ioa(instances2.bboxes, instances.bboxes)  # intersection over area, (N, M)
+        indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+        n = len(indexes)
+        sorted_idx = np.argsort(ioa.max(1)[indexes])
+        indexes = indexes[sorted_idx]
+        for j in indexes[: round(self.p * n)]:
+            cls = np.concatenate((cls, labels2.get("cls", cls)[[j]]), axis=0)
+            instances = Instances.concatenate((instances, instances2[[j]]), axis=0)
+            cv2.drawContours(im_new, instances2.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
+
+        result = labels2.get("img", cv2.flip(im, 1))  # augment segments
+        i = im_new.astype(bool)
+        im[i] = result[i]
+
+        labels1["img"] = im
+        labels1["cls"] = cls
+        labels1["instances"] = instances
+        return labels1
 
 
 class Albumentations:
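A minimal usage sketch of the rewritten class. Hedged: `dataset` stands in for any YOLO-style dataset exposing `__len__` and `get_image_and_label(i)`, and `labels` for a label dict carrying "img", "cls", and segment-bearing "instances" keys.

    from ultralytics.data.augment import CopyPaste

    flip_cp = CopyPaste(p=0.5)                          # "flip" mode: pastes mirrored copies within the same image
    mixup_cp = CopyPaste(dataset, p=0.5, mode="mixup")  # pastes objects pulled from a random second image
    labels = mixup_cp(labels)                           # no-op when labels["instances"].segments is empty or p == 0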
@@ -2259,9 +2272,9 @@ class RandomLoadText:
 
 def v8_transforms(dataset, imgsz, hyp, stretch=False):
     """
-    Applies a series of image transformations for …
+    Applies a series of image transformations for training.
 
-    This function creates a composition of image augmentation techniques to prepare images for …
+    This function creates a composition of image augmentation techniques to prepare images for YOLO training.
     It includes operations such as mosaic, copy-paste, random perspective, mixup, and various color adjustments.
 
     Args:
@@ -2280,20 +2293,28 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
     >>> transforms = v8_transforms(dataset, imgsz=640, hyp=hyp)
     >>> augmented_data = transforms(dataset[0])
     """
-    …
-            shear=hyp.shear,
-            perspective=hyp.perspective,
-            pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
-        ),
-    ]
+    mosaic = Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic)
+    affine = RandomPerspective(
+        degrees=hyp.degrees,
+        translate=hyp.translate,
+        scale=hyp.scale,
+        shear=hyp.shear,
+        perspective=hyp.perspective,
+        pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
     )
+
+    pre_transform = Compose([mosaic, affine])
+    if hyp.copy_paste_mode == "flip":
+        pre_transform.insert(1, CopyPaste(p=hyp.copy_paste, mode=hyp.copy_paste_mode))
+    else:
+        pre_transform.append(
+            CopyPaste(
+                dataset,
+                pre_transform=Compose([Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), affine]),
+                p=hyp.copy_paste,
+                mode=hyp.copy_paste_mode,
+            )
+        )
     flip_idx = dataset.data.get("flip_idx", [])  # for keypoints augmentation
     if dataset.use_keypoints:
         kpt_shape = dataset.data.get("kpt_shape", None)
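A hedged sketch of how this pipeline is driven from the training config: `copy_paste_mode` is the new hyperparameter added to cfg/default.yaml in this release ("flip" keeps the old behavior; "mixup" gives CopyPaste its own Mosaic plus affine pre_transform, as wired above). Model and dataset names below are illustrative.

    from ultralytics import YOLO

    model = YOLO("yolo11n-seg.pt")  # copy-paste needs segment labels
    model.train(data="coco8-seg.yaml", epochs=3, copy_paste=0.5, copy_paste_mode="mixup")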
ultralytics/engine/trainer.py
CHANGED
@@ -538,6 +538,8 @@ class BaseTrainer:
             self.best.write_bytes(serialized_ckpt)  # save best.pt
         if (self.save_period > 0) and (self.epoch % self.save_period == 0):
             (self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt)  # save epoch, i.e. 'epoch3.pt'
+        # if self.args.close_mosaic and self.epoch == (self.epochs - self.args.close_mosaic - 1):
+        #    (self.wdir / "last_mosaic.pt").write_bytes(serialized_ckpt)  # save mosaic checkpoint
 
     def get_dataset(self):
         """
@@ -698,7 +700,12 @@ class BaseTrainer:
             resume = True
             self.args = get_cfg(ckpt_args)
             self.args.model = self.args.resume = str(last)  # reinstate model
-            for k in …
+            for k in (
+                "imgsz",
+                "batch",
+                "device",
+                "close_mosaic",
+            ):  # allow arg updates to reduce memory or update device on resume
                 if k in overrides:
                     setattr(self.args, k, overrides[k])
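A hedged example of what the expanded tuple permits (the checkpoint path is illustrative): when resuming, these four keys may now be overridden rather than being locked to the values stored in the checkpoint, e.g. to shrink memory use or move devices mid-run.

    from ultralytics import YOLO

    model = YOLO("runs/detect/train/weights/last.pt")
    model.train(resume=True, imgsz=512, batch=8, device=0, close_mosaic=5)  # these overrides now take effect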
ultralytics/nn/modules/__init__.py
CHANGED

@@ -20,6 +20,7 @@ Example:
 from .block import (
     C1,
     C2,
+    C2PSA,
     C3,
     C3TR,
     CIB,
@@ -38,7 +39,9 @@ from .block import (
     C2f,
     C2fAttn,
     C2fCIB,
+    C2fPSA,
     C3Ghost,
+    C3k2,
     C3x,
     CBFuse,
     CBLinear,
@@ -110,6 +113,10 @@ __all__ = (
     "C2",
     "C3",
     "C2f",
+    "C3k2",
+    "SCDown",
+    "C2fPSA",
+    "C2PSA",
     "C2fAttn",
     "C3x",
     "C3TR",
@@ -149,5 +156,4 @@ __all__ = (
     "C2fCIB",
     "Attention",
     "PSA",
-    "SCDown",
 )
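Since the new blocks are re-exported at the package level, a quick smoke test of the 8.3.0 public surface (the names follow directly from the __all__ changes above) is:

    from ultralytics.nn.modules import C2PSA, C2fPSA, C3k2, SCDown  # all new in 8.3.0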
ultralytics/nn/modules/block.py
CHANGED
@@ -40,6 +40,9 @@ __all__ = (
     "SPPELAN",
     "CBFuse",
     "CBLinear",
+    "C3k2",
+    "C2fPSA",
+    "C2PSA",
     "RepVGGDW",
     "CIB",
     "C2fCIB",
@@ -696,6 +699,49 @@ class CBFuse(nn.Module):
         return torch.sum(torch.stack(res + xs[-1:]), dim=0)
 
 
+class C3f(nn.Module):
+    """Faster Implementation of CSP Bottleneck with 2 convolutions."""
+
+    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
+        """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
+        expansion.
+        """
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c1, c_, 1, 1)
+        self.cv3 = Conv((2 + n) * c_, c2, 1)  # optional act=FReLU(c2)
+        self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
+
+    def forward(self, x):
+        """Forward pass through C2f layer."""
+        y = [self.cv2(x), self.cv1(x)]
+        y.extend(m(y[-1]) for m in self.m)
+        return self.cv3(torch.cat(y, 1))
+
+
+class C3k2(C2f):
+    """Faster Implementation of CSP Bottleneck with 2 convolutions."""
+
+    def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
+        """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        self.m = nn.ModuleList(
+            C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
+        )
+
+
+class C3k(C3):
+    """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
+
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
+        """Initializes the C3k module with specified channels, number of layers, and configurations."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        c_ = int(c2 * e)  # hidden channels
+        # self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
+        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
+
+
 class RepVGGDW(torch.nn.Module):
     """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
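A hedged shape check for the new building block (module path as exported above; sizes are arbitrary): with c3k=False, C3k2 degenerates to plain C2f-style Bottlenecks, while c3k=True swaps each repeat for a two-deep C3k.

    import torch
    from ultralytics.nn.modules.block import C3k2

    m = C3k2(64, 64, n=2, c3k=True)
    print(m(torch.randn(1, 64, 40, 40)).shape)  # torch.Size([1, 64, 40, 40]); channels and spatial dims preserved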
@@ -873,25 +919,69 @@ class Attention(nn.Module):
         return x
 
 
+class PSABlock(nn.Module):
+    """
+    PSABlock class implementing a Position-Sensitive Attention block for neural networks.
+
+    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
+    with optional shortcut connections.
+
+    Attributes:
+        attn (Attention): Multi-head attention module.
+        ffn (nn.Sequential): Feed-forward neural network module.
+        add (bool): Flag indicating whether to add shortcut connections.
+
+    Methods:
+        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.
+
+    Examples:
+        Create a PSABlock and perform a forward pass
+        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
+        >>> input_tensor = torch.randn(1, 128, 32, 32)
+        >>> output_tensor = psablock(input_tensor)
+    """
+
+    def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
+        """Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction."""
+        super().__init__()
+
+        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
+        self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
+        self.add = shortcut
+
+    def forward(self, x):
+        """Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor."""
+        x = x + self.attn(x) if self.add else self.attn(x)
+        x = x + self.ffn(x) if self.add else self.ffn(x)
+        return x
+
+
 class PSA(nn.Module):
     """
-    Position-…
+    PSA class for implementing Position-Sensitive Attention in neural networks.
 
-    …
-        c2 (int): Number of output channels.
-        e (float): Expansion factor for the intermediate channels. Default is 0.5.
+    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
+    input tensors, enhancing feature extraction and processing capabilities.
 
     Attributes:
-        c (int): Number of …
+        c (int): Number of hidden channels after applying the initial convolution.
         cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
         cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
-        attn (Attention): Attention module for …
-        ffn (nn.Sequential): Feed-forward network …
+        attn (Attention): Attention module for position-sensitive attention.
+        ffn (nn.Sequential): Feed-forward network for further processing.
+
+    Methods:
+        forward: Applies position-sensitive attention and feed-forward network to the input tensor.
+
+    Examples:
+        Create a PSA module and apply it to an input tensor
+        >>> psa = PSA(c1=128, c2=128, e=0.5)
+        >>> input_tensor = torch.randn(1, 128, 64, 64)
+        >>> output_tensor = psa.forward(input_tensor)
     """
 
     def __init__(self, c1, c2, e=0.5):
-        """Initializes …
+        """Initializes the PSA module with input/output channels and attention mechanism for feature extraction."""
         super().__init__()
         assert c1 == c2
         self.c = int(c1 * e)
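A small hedged sketch of the residual wiring: PSABlock preserves the input shape, and with shortcut=True both the attention and FFN sub-layers add onto the input stream (internally x + attn(x), then x + ffn(x)).

    import torch
    from ultralytics.nn.modules.block import PSABlock

    blk = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
    print(blk(torch.randn(2, 128, 32, 32)).shape)  # torch.Size([2, 128, 32, 32])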
@@ -902,46 +992,117 @@ class PSA(nn.Module):
         self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
 
     def forward(self, x):
-        """
-        Forward pass of the PSA module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            (torch.Tensor): Output tensor.
-        """
+        """Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor."""
         a, b = self.cv1(x).split((self.c, self.c), dim=1)
         b = b + self.attn(b)
         b = b + self.ffn(b)
         return self.cv2(torch.cat((a, b), 1))
 
 
+class C2PSA(nn.Module):
+    """
+    C2PSA module with attention mechanism for enhanced feature extraction and processing.
+
+    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
+    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.
+
+    Attributes:
+        c (int): Number of hidden channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.
+
+    Methods:
+        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.
+
+    Notes:
+        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
+
+    Examples:
+        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
+        >>> input_tensor = torch.randn(1, 256, 64, 64)
+        >>> output_tensor = c2psa(input_tensor)
+    """
+
+    def __init__(self, c1, c2, n=1, e=0.5):
+        """Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio."""
+        super().__init__()
+        assert c1 == c2
+        self.c = int(c1 * e)
+        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+        self.cv2 = Conv(2 * self.c, c1, 1)
+
+        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
+
+    def forward(self, x):
+        """Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor."""
+        a, b = self.cv1(x).split((self.c, self.c), dim=1)
+        b = self.m(b)
+        return self.cv2(torch.cat((a, b), 1))
+
+
+class C2fPSA(C2f):
+    """
+    C2fPSA module with enhanced feature extraction using PSA blocks.
+
+    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.
+
+    Attributes:
+        c (int): Number of hidden channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        m (nn.ModuleList): List of PSA blocks for feature extraction.
+
+    Methods:
+        forward: Performs a forward pass through the C2fPSA module.
+        forward_split: Performs a forward pass using split() instead of chunk().
+
+    Examples:
+        >>> import torch
+        >>> from ultralytics.models.common import C2fPSA
+        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
+        >>> x = torch.randn(1, 64, 128, 128)
+        >>> output = model(x)
+        >>> print(output.shape)
+    """
+
+    def __init__(self, c1, c2, n=1, e=0.5):
+        """Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction."""
+        assert c1 == c2
+        super().__init__(c1, c2, n=n, e=e)
+        self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
+
+
 class SCDown(nn.Module):
-    """
+    """
+    SCDown module for downsampling with separable convolutions.
 
-    …
-    Spatial Channel Downsample (SCDown) module.
+    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
+    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.
 
-    …
+    Attributes:
+        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
+        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.
+
+    Methods:
+        forward: Applies the SCDown module to the input tensor.
+
+    Examples:
+        >>> import torch
+        >>> from ultralytics import SCDown
+        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
+        >>> x = torch.randn(1, 64, 128, 128)
+        >>> y = model(x)
+        >>> print(y.shape)
+        torch.Size([1, 128, 64, 64])
+    """
+
+    def __init__(self, c1, c2, k, s):
+        """Initializes the SCDown module with specified input/output channels, kernel size, and stride."""
         super().__init__()
         self.cv1 = Conv(c1, c2, 1, 1)
         self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
 
     def forward(self, x):
-        """
-        Forward pass of the SCDown module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            (torch.Tensor): Output tensor after applying the SCDown module.
-        """
+        """Applies convolution and downsampling to the input tensor in the SCDown module."""
         return self.cv2(self.cv1(x))
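One hedged end-to-end check tying the new blocks together (shapes arbitrary): C2PSA and C2fPSA size their attention heads as hidden_channels // 64, so c1 * e should stay a multiple of 64, and SCDown halves the spatial dims via a pointwise 1x1 followed by a strided depthwise 3x3.

    import torch
    from ultralytics.nn.modules.block import C2PSA, SCDown

    c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)  # 128 hidden channels -> 2 heads
    down = SCDown(c1=256, c2=512, k=3, s=2)    # pointwise 256->512, then 3x3/s2 depthwise
    x = torch.randn(1, 256, 64, 64)
    print(down(c2psa(x)).shape)  # torch.Size([1, 512, 32, 32])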
ultralytics/nn/modules/conv.py
CHANGED
@@ -209,7 +209,8 @@ class RepConv(nn.Module):
         kernelid, biasid = self._fuse_bn_tensor(self.bn)
         return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
 
-    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+    @staticmethod
+    def _pad_1x1_to_3x3_tensor(kernel1x1):
         """Pads a 1x1 tensor to a 3x3 tensor."""
         if kernel1x1 is None:
             return 0
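What the now-static helper computes, in isolation (a hedged standalone sketch; RepConv uses torch.nn.functional.pad the same way): the 1x1 branch's kernel is zero-padded into the center of a 3x3 kernel so that the 1x1 and 3x3 branches can be fused into a single 3x3 convolution.

    import torch
    import torch.nn.functional as F

    k1 = torch.randn(8, 8, 1, 1)  # (out_ch, in_ch, 1, 1): kernel of the 1x1 branch
    k3 = F.pad(k1, [1, 1, 1, 1])  # (8, 8, 3, 3): the original value sits at k3[:, :, 1, 1]
    print(k3.shape)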
ultralytics/nn/modules/head.py
CHANGED
@@ -11,7 +11,7 @@ from torch.nn.init import constant_, xavier_uniform_
 from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
 
 from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
-from .conv import Conv
+from .conv import Conv, DWConv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
 
@@ -41,7 +41,14 @@ class Detect(nn.Module):
         self.cv2 = nn.ModuleList(
             nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
         )
-        self.cv3 = nn.ModuleList(…)
+        self.cv3 = nn.ModuleList(
+            nn.Sequential(
+                nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
+                nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
+                nn.Conv2d(c3, self.nc, 1),
+            )
+            for x in ch
+        )
         self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
 
         if self.end2end:
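A hedged back-of-envelope on why the classification branch (cv3) was refactored: replacing full 3x3 Convs with depthwise 3x3 plus pointwise 1x1 pairs cuts the branch's parameter count by roughly an order of magnitude. The figures below assume one level with 256 channels, 80 classes, and (an assumption, since the old line is truncated in this view) an 8.2-style branch of Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, nc, 1).

    import torch.nn as nn
    from ultralytics.nn.modules.conv import Conv, DWConv

    x = c3 = 256
    nc = 80
    new_branch = nn.Sequential(
        nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
        nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
        nn.Conv2d(c3, nc, 1),
    )
    print(sum(p.numel() for p in new_branch.parameters()))  # ~158k, vs ~1.2M for two full 3x3 Convs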