ultralytics 8.2.102__py3-none-any.whl → 8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)
  1. tests/__init__.py +2 -2
  2. tests/conftest.py +1 -1
  3. tests/test_cuda.py +1 -1
  4. tests/test_engine.py +5 -5
  5. tests/test_explorer.py +3 -3
  6. tests/test_exports.py +1 -2
  7. tests/test_integrations.py +9 -9
  8. tests/test_python.py +11 -11
  9. tests/test_solutions.py +3 -3
  10. ultralytics/__init__.py +1 -2
  11. ultralytics/cfg/datasets/hand-keypoints.yaml +2 -2
  12. ultralytics/cfg/default.yaml +1 -0
  13. ultralytics/cfg/models/11/yolo11-cls.yaml +30 -0
  14. ultralytics/cfg/models/11/yolo11-obb.yaml +47 -0
  15. ultralytics/cfg/models/11/yolo11-pose.yaml +48 -0
  16. ultralytics/cfg/models/11/yolo11-seg.yaml +47 -0
  17. ultralytics/cfg/models/11/yolo11.yaml +47 -0
  18. ultralytics/data/augment.py +101 -80
  19. ultralytics/engine/trainer.py +8 -1
  20. ultralytics/nn/modules/__init__.py +7 -1
  21. ultralytics/nn/modules/block.py +198 -37
  22. ultralytics/nn/modules/conv.py +2 -1
  23. ultralytics/nn/modules/head.py +9 -2
  24. ultralytics/nn/tasks.py +25 -2
  25. ultralytics/utils/__init__.py +9 -2
  26. ultralytics/utils/benchmarks.py +103 -66
  27. ultralytics/utils/downloads.py +3 -2
  28. ultralytics/utils/loss.py +3 -0
  29. ultralytics/utils/torch_utils.py +1 -1
  30. {ultralytics-8.2.102.dist-info → ultralytics-8.3.0.dist-info}/METADATA +51 -65
  31. {ultralytics-8.2.102.dist-info → ultralytics-8.3.0.dist-info}/RECORD +35 -30
  32. {ultralytics-8.2.102.dist-info → ultralytics-8.3.0.dist-info}/LICENSE +0 -0
  33. {ultralytics-8.2.102.dist-info → ultralytics-8.3.0.dist-info}/WHEEL +0 -0
  34. {ultralytics-8.2.102.dist-info → ultralytics-8.3.0.dist-info}/entry_points.txt +0 -0
  35. {ultralytics-8.2.102.dist-info → ultralytics-8.3.0.dist-info}/top_level.txt +0 -0
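
The headline additions are the five new YOLO11 model configs (items 13-17 above). As a quick orientation before the hunks below, a minimal sketch of building a model from one of them; the `yolo11n.yaml` scale alias is assumed from Ultralytics' usual n/s/m/l/x naming convention rather than shown in this diff:

```python
# Hedged sketch: build a YOLO11 detection model from the new 8.3.0 configs.
# "yolo11n.yaml" (nano scale) is an assumed alias of cfg/models/11/yolo11.yaml.
from ultralytics import YOLO

model = YOLO("yolo11n.yaml")  # constructs the architecture from the new config
model.info()  # prints a layer/parameter summary
```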
ultralytics/data/augment.py
@@ -1628,92 +1628,105 @@ class LetterBox:
         return labels
 
 
-class CopyPaste:
+class CopyPaste(BaseMixTransform):
     """
-    Implements Copy-Paste augmentation as described in https://arxiv.org/abs/2012.07177.
+    CopyPaste class for applying Copy-Paste augmentation to image datasets.
 
-    This class applies Copy-Paste augmentation on images and their corresponding instances.
+    This class implements the Copy-Paste augmentation technique as described in the paper "Simple Copy-Paste is a Strong
+    Data Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It combines objects from
+    different images to create new training samples.
 
     Attributes:
-        p (float): Probability of applying the Copy-Paste augmentation. Must be between 0 and 1.
+        dataset (Any): The dataset to which Copy-Paste augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before Copy-Paste.
+        p (float): Probability of applying Copy-Paste augmentation.
 
     Methods:
-        __call__: Applies Copy-Paste augmentation to given image and instances.
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies Copy-Paste augmentation to the input labels.
+        __call__: Applies the Copy-Paste transformation to images and annotations.
 
     Examples:
-        >>> copypaste = CopyPaste(p=0.5)
-        >>> augmented_labels = copypaste(labels)
-        >>> augmented_image = augmented_labels["img"]
+        >>> from ultralytics.data.augment import CopyPaste
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> copypaste = CopyPaste(dataset, p=0.5)
+        >>> augmented_labels = copypaste(original_labels)
     """
 
-    def __init__(self, p=0.5) -> None:
-        """
-        Initializes the CopyPaste augmentation object.
+    def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
+        """Initializes CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+        assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
+        self.mode = mode
 
-        This class implements the Copy-Paste augmentation as described in the paper "Simple Copy-Paste is a Strong Data
-        Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It applies the Copy-Paste
-        augmentation on images and their corresponding instances with a given probability.
+    def get_indexes(self):
+        """Returns a list of random indexes from the dataset for CopyPaste augmentation."""
+        return random.randint(0, len(self.dataset) - 1)
 
-    Args:
-        p (float): The probability of applying the Copy-Paste augmentation. Must be between 0 and 1.
+    def _mix_transform(self, labels):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        labels2 = labels["mix_labels"][0]
+        return self._transform(labels, labels2)
 
-    Attributes:
-        p (float): Stores the probability of applying the augmentation.
+    def __call__(self, labels):
+        """Applies Copy-Paste augmentation to an image and its labels."""
+        if len(labels["instances"].segments) == 0 or self.p == 0:
+            return labels
+        if self.mode == "flip":
+            return self._transform(labels)
 
-    Examples:
-        >>> augment = CopyPaste(p=0.7)
-        >>> augmented_data = augment(original_data)
-        """
-        self.p = p
+        # Get index of one or three other images
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
 
-    def __call__(self, labels):
-        """
-        Applies Copy-Paste augmentation to an image and its instances.
+        # Get images information will be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
 
-        Args:
-            labels (Dict): A dictionary containing:
-                - 'img' (np.ndarray): The image to augment.
-                - 'cls' (np.ndarray): Class labels for the instances.
-                - 'instances' (ultralytics.engine.results.Instances): Object containing bounding boxes, segments, etc.
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
 
-        Returns:
-            (Dict): Dictionary with augmented image and updated instances under 'img', 'cls', and 'instances' keys.
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
 
-        Examples:
-            >>> labels = {"img": np.random.rand(640, 640, 3), "cls": np.array([0, 1, 2]), "instances": Instances(...)}
-            >>> augmenter = CopyPaste(p=0.5)
-            >>> augmented_labels = augmenter(labels)
-        """
-        im = labels["img"]
-        cls = labels["cls"]
+    def _transform(self, labels1, labels2={}):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        im = labels1["img"]
+        cls = labels1["cls"]
         h, w = im.shape[:2]
-        instances = labels.pop("instances")
+        instances = labels1.pop("instances")
         instances.convert_bbox(format="xyxy")
         instances.denormalize(w, h)
-        if self.p and len(instances.segments):
-            _, w, _ = im.shape  # height, width, channels
-            im_new = np.zeros(im.shape, np.uint8)
-
-            # Calculate ioa first then select indexes randomly
-            ins_flip = deepcopy(instances)
-            ins_flip.fliplr(w)
-
-            ioa = bbox_ioa(ins_flip.bboxes, instances.bboxes)  # intersection over area, (N, M)
-            indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
-            n = len(indexes)
-            for j in random.sample(list(indexes), k=round(self.p * n)):
-                cls = np.concatenate((cls, cls[[j]]), axis=0)
-                instances = Instances.concatenate((instances, ins_flip[[j]]), axis=0)
-                cv2.drawContours(im_new, instances.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
-
-            result = cv2.flip(im, 1)  # augment segments (flip left-right)
-            i = cv2.flip(im_new, 1).astype(bool)
-            im[i] = result[i]
-
-        labels["img"] = im
-        labels["cls"] = cls
-        labels["instances"] = instances
-        return labels
+
+        im_new = np.zeros(im.shape, np.uint8)
+        instances2 = labels2.pop("instances", None)
+        if instances2 is None:
+            instances2 = deepcopy(instances)
+            instances2.fliplr(w)
+        ioa = bbox_ioa(instances2.bboxes, instances.bboxes)  # intersection over area, (N, M)
+        indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+        n = len(indexes)
+        sorted_idx = np.argsort(ioa.max(1)[indexes])
+        indexes = indexes[sorted_idx]
+        for j in indexes[: round(self.p * n)]:
+            cls = np.concatenate((cls, labels2.get("cls", cls)[[j]]), axis=0)
+            instances = Instances.concatenate((instances, instances2[[j]]), axis=0)
+            cv2.drawContours(im_new, instances2.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
+
+        result = labels2.get("img", cv2.flip(im, 1))  # augment segments
+        i = im_new.astype(bool)
+        im[i] = result[i]
+
+        labels1["img"] = im
+        labels1["cls"] = cls
+        labels1["instances"] = instances
+        return labels1
 
 
 class Albumentations:
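
For orientation, a minimal usage sketch of the reworked class in its new "flip" mode. The labels-dict layout ("img", "cls", "instances" with segment annotations) is assumed from the code above; `Instances` lives in `ultralytics.utils.instance`:

```python
# Hedged sketch: standalone CopyPaste in the new "flip" mode (self-paste of
# flipped object copies). Copy-Paste is a no-op without segment annotations.
import numpy as np
from ultralytics.data.augment import CopyPaste
from ultralytics.utils.instance import Instances

segments = np.array([[[100, 100], [200, 100], [200, 200], [100, 200]]], dtype=np.float32)
boxes = np.array([[100, 100, 200, 200]], dtype=np.float32)
labels = {
    "img": np.zeros((640, 640, 3), dtype=np.uint8),
    "cls": np.array([[0]]),
    "instances": Instances(boxes, segments, bbox_format="xyxy", normalized=False),
}

flip_cp = CopyPaste(p=1.0, mode="flip")  # "mixup" mode additionally needs a dataset argument
out = flip_cp(labels)
print(out["instances"].bboxes.shape)  # (2, 4): original box plus one flipped paste
```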
@@ -2259,9 +2272,9 @@ class RandomLoadText:
 
 def v8_transforms(dataset, imgsz, hyp, stretch=False):
     """
-    Applies a series of image transformations for YOLOv8 training.
+    Applies a series of image transformations for training.
 
-    This function creates a composition of image augmentation techniques to prepare images for YOLOv8 training.
+    This function creates a composition of image augmentation techniques to prepare images for YOLO training.
     It includes operations such as mosaic, copy-paste, random perspective, mixup, and various color adjustments.
 
     Args:
@@ -2280,20 +2293,28 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
         >>> transforms = v8_transforms(dataset, imgsz=640, hyp=hyp)
         >>> augmented_data = transforms(dataset[0])
     """
-    pre_transform = Compose(
-        [
-            Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic),
-            CopyPaste(p=hyp.copy_paste),
-            RandomPerspective(
-                degrees=hyp.degrees,
-                translate=hyp.translate,
-                scale=hyp.scale,
-                shear=hyp.shear,
-                perspective=hyp.perspective,
-                pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
-            ),
-        ]
+    mosaic = Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic)
+    affine = RandomPerspective(
+        degrees=hyp.degrees,
+        translate=hyp.translate,
+        scale=hyp.scale,
+        shear=hyp.shear,
+        perspective=hyp.perspective,
+        pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
     )
+
+    pre_transform = Compose([mosaic, affine])
+    if hyp.copy_paste_mode == "flip":
+        pre_transform.insert(1, CopyPaste(p=hyp.copy_paste, mode=hyp.copy_paste_mode))
+    else:
+        pre_transform.append(
+            CopyPaste(
+                dataset,
+                pre_transform=Compose([Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), affine]),
+                p=hyp.copy_paste,
+                mode=hyp.copy_paste_mode,
+            )
+        )
     flip_idx = dataset.data.get("flip_idx", [])  # for keypoints augmentation
     if dataset.use_keypoints:
         kpt_shape = dataset.data.get("kpt_shape", None)
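
The dispatch above keys off `hyp.copy_paste_mode`, which corresponds to the new `copy_paste_mode` entry added to `cfg/default.yaml` (item 12 in the file list). A hedged training sketch; the dataset and epoch values are illustrative:

```python
# Sketch: selecting the paste strategy via the new copy_paste_mode hyperparameter.
from ultralytics import YOLO

model = YOLO("yolo11n-seg.yaml")  # Copy-Paste requires segment labels, hence a -seg model
model.train(data="coco8-seg.yaml", epochs=3, copy_paste=0.3, copy_paste_mode="mixup")
```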
ultralytics/engine/trainer.py
@@ -538,6 +538,8 @@ class BaseTrainer:
             self.best.write_bytes(serialized_ckpt)  # save best.pt
         if (self.save_period > 0) and (self.epoch % self.save_period == 0):
             (self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt)  # save epoch, i.e. 'epoch3.pt'
+        # if self.args.close_mosaic and self.epoch == (self.epochs - self.args.close_mosaic - 1):
+        #     (self.wdir / "last_mosaic.pt").write_bytes(serialized_ckpt)  # save mosaic checkpoint
 
     def get_dataset(self):
         """
@@ -698,7 +700,12 @@ class BaseTrainer:
                 resume = True
                 self.args = get_cfg(ckpt_args)
                 self.args.model = self.args.resume = str(last)  # reinstate model
-                for k in "imgsz", "batch", "device":  # allow arg updates to reduce memory or update device on resume
+                for k in (
+                    "imgsz",
+                    "batch",
+                    "device",
+                    "close_mosaic",
+                ):  # allow arg updates to reduce memory or update device on resume
                     if k in overrides:
                         setattr(self.args, k, overrides[k])
 
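With `close_mosaic` added to the resume whitelist, a resumed run can now change when mosaic is switched off. A sketch, with an illustrative checkpoint path:

```python
# Sketch: resume training while overriding close_mosaic, now permitted by the
# resume logic above (previously only imgsz, batch, and device could change).
from ultralytics import YOLO

model = YOLO("runs/detect/train/weights/last.pt")  # path is illustrative
model.train(resume=True, close_mosaic=20)  # disable mosaic for the final 20 epochs
```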
ultralytics/nn/modules/__init__.py
@@ -20,6 +20,7 @@ Example:
 from .block import (
     C1,
     C2,
+    C2PSA,
     C3,
     C3TR,
     CIB,
@@ -38,7 +39,9 @@ from .block import (
     C2f,
     C2fAttn,
     C2fCIB,
+    C2fPSA,
     C3Ghost,
+    C3k2,
     C3x,
     CBFuse,
     CBLinear,
@@ -110,6 +113,10 @@ __all__ = (
     "C2",
     "C3",
     "C2f",
+    "C3k2",
+    "SCDown",
+    "C2fPSA",
+    "C2PSA",
     "C2fAttn",
     "C3x",
     "C3TR",
@@ -149,5 +156,4 @@ __all__ = (
     "C2fCIB",
     "Attention",
     "PSA",
-    "SCDown",
 )
ultralytics/nn/modules/block.py
@@ -40,6 +40,9 @@ __all__ = (
     "SPPELAN",
     "CBFuse",
     "CBLinear",
+    "C3k2",
+    "C2fPSA",
+    "C2PSA",
     "RepVGGDW",
     "CIB",
     "C2fCIB",
@@ -696,6 +699,49 @@ class CBFuse(nn.Module):
         return torch.sum(torch.stack(res + xs[-1:]), dim=0)
 
 
+class C3f(nn.Module):
+    """Faster Implementation of CSP Bottleneck with 2 convolutions."""
+
+    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
+        """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
+        expansion.
+        """
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c1, c_, 1, 1)
+        self.cv3 = Conv((2 + n) * c_, c2, 1)  # optional act=FReLU(c2)
+        self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
+
+    def forward(self, x):
+        """Forward pass through C2f layer."""
+        y = [self.cv2(x), self.cv1(x)]
+        y.extend(m(y[-1]) for m in self.m)
+        return self.cv3(torch.cat(y, 1))
+
+
+class C3k2(C2f):
+    """Faster Implementation of CSP Bottleneck with 2 convolutions."""
+
+    def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
+        """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        self.m = nn.ModuleList(
+            C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
+        )
+
+
+class C3k(C3):
+    """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
+
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
+        """Initializes the C3k module with specified channels, number of layers, and configurations."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        c_ = int(c2 * e)  # hidden channels
+        # self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
+        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
+
+
 class RepVGGDW(torch.nn.Module):
     """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
 
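C3k2 reuses the C2f skeleton but lets each stage swap its plain Bottleneck for a small C3k sub-block. A shape-check sketch, assuming this release's module layout:

```python
# Shape-check sketch for the new C3k2 block: channels can change while the
# spatial dimensions are preserved.
import torch
from ultralytics.nn.modules.block import C3k2

m = C3k2(c1=64, c2=128, n=2, c3k=True)  # two stages, each a C3k sub-block
x = torch.randn(1, 64, 40, 40)
print(m(x).shape)  # torch.Size([1, 128, 40, 40])
```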
@@ -873,25 +919,69 @@ class Attention(nn.Module):
         return x
 
 
+class PSABlock(nn.Module):
+    """
+    PSABlock class implementing a Position-Sensitive Attention block for neural networks.
+
+    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
+    with optional shortcut connections.
+
+    Attributes:
+        attn (Attention): Multi-head attention module.
+        ffn (nn.Sequential): Feed-forward neural network module.
+        add (bool): Flag indicating whether to add shortcut connections.
+
+    Methods:
+        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.
+
+    Examples:
+        Create a PSABlock and perform a forward pass
+        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
+        >>> input_tensor = torch.randn(1, 128, 32, 32)
+        >>> output_tensor = psablock(input_tensor)
+    """
+
+    def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
+        """Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction."""
+        super().__init__()
+
+        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
+        self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
+        self.add = shortcut
+
+    def forward(self, x):
+        """Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor."""
+        x = x + self.attn(x) if self.add else self.attn(x)
+        x = x + self.ffn(x) if self.add else self.ffn(x)
+        return x
+
+
 class PSA(nn.Module):
     """
-    Position-wise Spatial Attention module.
+    PSA class for implementing Position-Sensitive Attention in neural networks.
 
-    Args:
-        c1 (int): Number of input channels.
-        c2 (int): Number of output channels.
-        e (float): Expansion factor for the intermediate channels. Default is 0.5.
+    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
+    input tensors, enhancing feature extraction and processing capabilities.
 
     Attributes:
-        c (int): Number of intermediate channels.
+        c (int): Number of hidden channels after applying the initial convolution.
         cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
         cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
-        attn (Attention): Attention module for spatial attention.
-        ffn (nn.Sequential): Feed-forward network module.
+        attn (Attention): Attention module for position-sensitive attention.
+        ffn (nn.Sequential): Feed-forward network for further processing.
+
+    Methods:
+        forward: Applies position-sensitive attention and feed-forward network to the input tensor.
+
+    Examples:
+        Create a PSA module and apply it to an input tensor
+        >>> psa = PSA(c1=128, c2=128, e=0.5)
+        >>> input_tensor = torch.randn(1, 128, 64, 64)
+        >>> output_tensor = psa.forward(input_tensor)
     """
 
     def __init__(self, c1, c2, e=0.5):
-        """Initializes convolution layers, attention module, and feed-forward network with channel reduction."""
+        """Initializes the PSA module with input/output channels and attention mechanism for feature extraction."""
        super().__init__()
         assert c1 == c2
         self.c = int(c1 * e)
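
PSABlock factors PSA's attention-plus-FFN pair into a reusable unit with an optional residual. A quick sketch of the shape-preserving residual path, assuming this release's module layout:

```python
# With shortcut=True, both the attention and FFN outputs are added back to
# their inputs, so the output shape always matches the input.
import torch
from ultralytics.nn.modules.block import PSABlock

block = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
x = torch.randn(1, 128, 32, 32)
assert block(x).shape == x.shape  # channel and spatial dims preserved
```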
@@ -902,46 +992,117 @@ class PSA(nn.Module):
         self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
 
     def forward(self, x):
-        """
-        Forward pass of the PSA module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            (torch.Tensor): Output tensor.
-        """
+        """Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor."""
         a, b = self.cv1(x).split((self.c, self.c), dim=1)
         b = b + self.attn(b)
         b = b + self.ffn(b)
         return self.cv2(torch.cat((a, b), 1))
 
 
+class C2PSA(nn.Module):
+    """
+    C2PSA module with attention mechanism for enhanced feature extraction and processing.
+
+    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
+    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.
+
+    Attributes:
+        c (int): Number of hidden channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.
+
+    Methods:
+        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.
+
+    Notes:
+        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
+
+    Examples:
+        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
+        >>> input_tensor = torch.randn(1, 256, 64, 64)
+        >>> output_tensor = c2psa(input_tensor)
+    """
+
+    def __init__(self, c1, c2, n=1, e=0.5):
+        """Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio."""
+        super().__init__()
+        assert c1 == c2
+        self.c = int(c1 * e)
+        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+        self.cv2 = Conv(2 * self.c, c1, 1)
+
+        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
+
+    def forward(self, x):
+        """Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor."""
+        a, b = self.cv1(x).split((self.c, self.c), dim=1)
+        b = self.m(b)
+        return self.cv2(torch.cat((a, b), 1))
+
+
+class C2fPSA(C2f):
+    """
+    C2fPSA module with enhanced feature extraction using PSA blocks.
+
+    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.
+
+    Attributes:
+        c (int): Number of hidden channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        m (nn.ModuleList): List of PSA blocks for feature extraction.
+
+    Methods:
+        forward: Performs a forward pass through the C2fPSA module.
+        forward_split: Performs a forward pass using split() instead of chunk().
+
+    Examples:
+        >>> import torch
+        >>> from ultralytics.models.common import C2fPSA
+        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
+        >>> x = torch.randn(1, 64, 128, 128)
+        >>> output = model(x)
+        >>> print(output.shape)
+    """
+
+    def __init__(self, c1, c2, n=1, e=0.5):
+        """Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction."""
+        assert c1 == c2
+        super().__init__(c1, c2, n=n, e=e)
+        self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
+
+
 class SCDown(nn.Module):
-    """Spatial Channel Downsample (SCDown) module for reducing spatial and channel dimensions."""
+    """
+    SCDown module for downsampling with separable convolutions.
 
-    def __init__(self, c1, c2, k, s):
-        """
-        Spatial Channel Downsample (SCDown) module.
+    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
+    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.
 
-        Args:
-            c1 (int): Number of input channels.
-            c2 (int): Number of output channels.
-            k (int): Kernel size for the convolutional layer.
-            s (int): Stride for the convolutional layer.
-        """
+    Attributes:
+        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
+        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.
+
+    Methods:
+        forward: Applies the SCDown module to the input tensor.
+
+    Examples:
+        >>> import torch
+        >>> from ultralytics import SCDown
+        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
+        >>> x = torch.randn(1, 64, 128, 128)
+        >>> y = model(x)
+        >>> print(y.shape)
+        torch.Size([1, 128, 64, 64])
+    """
+
+    def __init__(self, c1, c2, k, s):
+        """Initializes the SCDown module with specified input/output channels, kernel size, and stride."""
         super().__init__()
         self.cv1 = Conv(c1, c2, 1, 1)
         self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
 
     def forward(self, x):
-        """
-        Forward pass of the SCDown module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            (torch.Tensor): Output tensor after applying the SCDown module.
-        """
+        """Applies convolution and downsampling to the input tensor in the SCDown module."""
         return self.cv2(self.cv1(x))
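
SCDown replaces one dense strided 3x3 convolution with a 1x1 pointwise projection followed by a strided depthwise convolution. A back-of-envelope parameter comparison using plain `nn.Conv2d` (a simplification: ultralytics' `Conv` wrapper also adds BatchNorm and an activation):

```python
# Parameter-count sketch motivating the separable downsampling design.
import torch.nn as nn

c1, c2 = 64, 128
dense = nn.Conv2d(c1, c2, 3, stride=2, padding=1, bias=False)          # 64*128*9 = 73,728 params
separable = nn.Sequential(
    nn.Conv2d(c1, c2, 1, bias=False),                                  # 64*128   =  8,192 params
    nn.Conv2d(c2, c2, 3, stride=2, padding=1, groups=c2, bias=False),  # 128*9    =  1,152 params
)
print(sum(p.numel() for p in dense.parameters()))      # 73728
print(sum(p.numel() for p in separable.parameters()))  # 9344
```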
ultralytics/nn/modules/conv.py
@@ -209,7 +209,8 @@ class RepConv(nn.Module):
         kernelid, biasid = self._fuse_bn_tensor(self.bn)
         return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
 
-    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+    @staticmethod
+    def _pad_1x1_to_3x3_tensor(kernel1x1):
         """Pads a 1x1 tensor to a 3x3 tensor."""
         if kernel1x1 is None:
             return 0
ultralytics/nn/modules/head.py
@@ -11,7 +11,7 @@ from torch.nn.init import constant_, xavier_uniform_
 from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
 
 from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
-from .conv import Conv
+from .conv import Conv, DWConv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
 
@@ -41,7 +41,14 @@ class Detect(nn.Module):
         self.cv2 = nn.ModuleList(
             nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
         )
-        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
+        self.cv3 = nn.ModuleList(
+            nn.Sequential(
+                nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
+                nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
+                nn.Conv2d(c3, self.nc, 1),
+            )
+            for x in ch
+        )
         self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
 
         if self.end2end:
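
The new `cv3` classification branch swaps dense 3x3 convolutions for DWConv-plus-1x1 pairs, shrinking the detection head. A hedged comparison of the two branch designs; `Conv` and `DWConv` are the release's own modules, and the channel sizes are illustrative:

```python
# Sketch contrasting the old dense classification branch with the new
# depthwise-separable one from the Detect head above.
import torch
from ultralytics.nn.modules.conv import Conv, DWConv

x_ch, c3, nc = 256, 256, 80
old = torch.nn.Sequential(Conv(x_ch, c3, 3), Conv(c3, c3, 3), torch.nn.Conv2d(c3, nc, 1))
new = torch.nn.Sequential(
    torch.nn.Sequential(DWConv(x_ch, x_ch, 3), Conv(x_ch, c3, 1)),
    torch.nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
    torch.nn.Conv2d(c3, nc, 1),
)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(old), n_params(new))  # the separable branch has far fewer parameters
```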