ultralytics 8.2.103__py3-none-any.whl → 8.3.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
- tests/__init__.py +2 -2
- tests/conftest.py +1 -1
- tests/test_cuda.py +1 -1
- tests/test_engine.py +5 -5
- tests/test_explorer.py +3 -3
- tests/test_exports.py +1 -2
- tests/test_integrations.py +9 -9
- tests/test_python.py +11 -11
- tests/test_solutions.py +3 -3
- ultralytics/__init__.py +1 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +2 -2
- ultralytics/cfg/default.yaml +1 -0
- ultralytics/cfg/models/11/yolo11-cls.yaml +30 -0
- ultralytics/cfg/models/11/yolo11-obb.yaml +47 -0
- ultralytics/cfg/models/11/yolo11-pose.yaml +48 -0
- ultralytics/cfg/models/11/yolo11-seg.yaml +47 -0
- ultralytics/cfg/models/11/yolo11.yaml +47 -0
- ultralytics/data/augment.py +101 -80
- ultralytics/engine/trainer.py +8 -1
- ultralytics/nn/modules/__init__.py +7 -1
- ultralytics/nn/modules/block.py +198 -37
- ultralytics/nn/modules/conv.py +2 -1
- ultralytics/nn/modules/head.py +9 -2
- ultralytics/nn/tasks.py +25 -2
- ultralytics/utils/downloads.py +3 -2
- ultralytics/utils/loss.py +3 -0
- ultralytics/utils/torch_utils.py +1 -1
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/METADATA +51 -65
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/RECORD +33 -28
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/LICENSE +0 -0
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/WHEEL +0 -0
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.2.103.dist-info → ultralytics-8.3.0.dist-info}/top_level.txt +0 -0
ultralytics/data/augment.py
CHANGED
@@ -1628,92 +1628,105 @@ class LetterBox:
         return labels
 
 
-class CopyPaste:
+class CopyPaste(BaseMixTransform):
     """
-    …
+    CopyPaste class for applying Copy-Paste augmentation to image datasets.
 
+    This class implements the Copy-Paste augmentation technique as described in the paper "Simple Copy-Paste is a Strong
+    Data Augmentation Method for Instance Segmentation" (https://arxiv.org/abs/2012.07177). It combines objects from
+    different images to create new training samples.
 
     Attributes:
-        …
+        dataset (Any): The dataset to which Copy-Paste augmentation will be applied.
+        pre_transform (Callable | None): Optional transform to apply before Copy-Paste.
+        p (float): Probability of applying Copy-Paste augmentation.
 
     Methods:
-        …
+        get_indexes: Returns a random index from the dataset.
+        _mix_transform: Applies Copy-Paste augmentation to the input labels.
+        __call__: Applies the Copy-Paste transformation to images and annotations.
 
     Examples:
-        >>> …
-        >>> …
-        >>> …
+        >>> from ultralytics.data.augment import CopyPaste
+        >>> dataset = YourDataset(...)  # Your image dataset
+        >>> copypaste = CopyPaste(dataset, p=0.5)
+        >>> augmented_labels = copypaste(original_labels)
     """
 
-    def __init__(self, p=0.5) -> None:
-        """
-        …
+    def __init__(self, dataset=None, pre_transform=None, p=0.5, mode="flip") -> None:
+        """Initializes CopyPaste object with dataset, pre_transform, and probability of applying MixUp."""
+        super().__init__(dataset=dataset, pre_transform=pre_transform, p=p)
+        assert mode in {"flip", "mixup"}, f"Expected `mode` to be `flip` or `mixup`, but got {mode}."
+        self.mode = mode
 
-        …
+    def get_indexes(self):
+        """Returns a list of random indexes from the dataset for CopyPaste augmentation."""
+        return random.randint(0, len(self.dataset) - 1)
 
-        …
+    def _mix_transform(self, labels):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        labels2 = labels["mix_labels"][0]
+        return self._transform(labels, labels2)
 
-        …
+    def __call__(self, labels):
+        """Applies Copy-Paste augmentation to an image and its labels."""
+        if len(labels["instances"].segments) == 0 or self.p == 0:
+            return labels
+        if self.mode == "flip":
+            return self._transform(labels)
 
-        …
-        self.p = p
+        # Get index of one or three other images
+        indexes = self.get_indexes()
+        if isinstance(indexes, int):
+            indexes = [indexes]
 
-        …
-        Applies Copy-Paste augmentation to an image and its instances.
+        # Get images information will be used for Mosaic or MixUp
+        mix_labels = [self.dataset.get_image_and_label(i) for i in indexes]
 
-        …
-            - 'instances' (ultralytics.engine.results.Instances): Object containing bounding boxes, segments, etc.
+        if self.pre_transform is not None:
+            for i, data in enumerate(mix_labels):
+                mix_labels[i] = self.pre_transform(data)
+        labels["mix_labels"] = mix_labels
 
-        …
+        # Update cls and texts
+        labels = self._update_label_text(labels)
+        # Mosaic or MixUp
+        labels = self._mix_transform(labels)
+        labels.pop("mix_labels", None)
+        return labels
 
-        …
-        """
-        im = labels["img"]
-        cls = labels["cls"]
+    def _transform(self, labels1, labels2={}):
+        """Applies Copy-Paste augmentation to combine objects from another image into the current image."""
+        im = labels1["img"]
+        cls = labels1["cls"]
         h, w = im.shape[:2]
-        instances = …
+        instances = labels1.pop("instances")
         instances.convert_bbox(format="xyxy")
         instances.denormalize(w, h)
-        …
-        return …
+
+        im_new = np.zeros(im.shape, np.uint8)
+        instances2 = labels2.pop("instances", None)
+        if instances2 is None:
+            instances2 = deepcopy(instances)
+            instances2.fliplr(w)
+        ioa = bbox_ioa(instances2.bboxes, instances.bboxes)  # intersection over area, (N, M)
+        indexes = np.nonzero((ioa < 0.30).all(1))[0]  # (N, )
+        n = len(indexes)
+        sorted_idx = np.argsort(ioa.max(1)[indexes])
+        indexes = indexes[sorted_idx]
+        for j in indexes[: round(self.p * n)]:
+            cls = np.concatenate((cls, labels2.get("cls", cls)[[j]]), axis=0)
+            instances = Instances.concatenate((instances, instances2[[j]]), axis=0)
+            cv2.drawContours(im_new, instances2.segments[[j]].astype(np.int32), -1, (1, 1, 1), cv2.FILLED)
+
+        result = labels2.get("img", cv2.flip(im, 1))  # augment segments
+        i = im_new.astype(bool)
+        im[i] = result[i]
+
+        labels1["img"] = im
+        labels1["cls"] = cls
+        labels1["instances"] = instances
+        return labels1
 
 
 class Albumentations:
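A minimal usage sketch of the rewritten class. Hedged: `dataset` stands in for any YOLO-style dataset exposing `__len__` and `get_image_and_label(i)`, and `labels` for a label dict carrying "img", "cls", and segment-bearing "instances" keys.

    from ultralytics.data.augment import CopyPaste

    flip_cp = CopyPaste(p=0.5)                          # "flip" mode: pastes mirrored copies within the same image
    mixup_cp = CopyPaste(dataset, p=0.5, mode="mixup")  # pastes objects pulled from a random second image
    labels = mixup_cp(labels)                           # no-op when labels["instances"].segments is empty or p == 0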
@@ -2259,9 +2272,9 @@ class RandomLoadText:
 
 def v8_transforms(dataset, imgsz, hyp, stretch=False):
     """
-    Applies a series of image transformations for …
+    Applies a series of image transformations for training.
 
-    This function creates a composition of image augmentation techniques to prepare images for …
+    This function creates a composition of image augmentation techniques to prepare images for YOLO training.
     It includes operations such as mosaic, copy-paste, random perspective, mixup, and various color adjustments.
 
     Args:
@@ -2280,20 +2293,28 @@ def v8_transforms(dataset, imgsz, hyp, stretch=False):
     >>> transforms = v8_transforms(dataset, imgsz=640, hyp=hyp)
     >>> augmented_data = transforms(dataset[0])
     """
-    …
-            shear=hyp.shear,
-            perspective=hyp.perspective,
-            pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
-        ),
-    ]
+    mosaic = Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic)
+    affine = RandomPerspective(
+        degrees=hyp.degrees,
+        translate=hyp.translate,
+        scale=hyp.scale,
+        shear=hyp.shear,
+        perspective=hyp.perspective,
+        pre_transform=None if stretch else LetterBox(new_shape=(imgsz, imgsz)),
     )
+
+    pre_transform = Compose([mosaic, affine])
+    if hyp.copy_paste_mode == "flip":
+        pre_transform.insert(1, CopyPaste(p=hyp.copy_paste, mode=hyp.copy_paste_mode))
+    else:
+        pre_transform.append(
+            CopyPaste(
+                dataset,
+                pre_transform=Compose([Mosaic(dataset, imgsz=imgsz, p=hyp.mosaic), affine]),
+                p=hyp.copy_paste,
+                mode=hyp.copy_paste_mode,
+            )
+        )
     flip_idx = dataset.data.get("flip_idx", [])  # for keypoints augmentation
     if dataset.use_keypoints:
         kpt_shape = dataset.data.get("kpt_shape", None)
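A hedged sketch of how this pipeline is driven from the training config: `copy_paste_mode` is the new hyperparameter added to cfg/default.yaml in this release ("flip" keeps the old behavior; "mixup" gives CopyPaste its own Mosaic plus affine pre_transform, as wired above). Model and dataset names below are illustrative.

    from ultralytics import YOLO

    model = YOLO("yolo11n-seg.pt")  # copy-paste needs segment labels
    model.train(data="coco8-seg.yaml", epochs=3, copy_paste=0.5, copy_paste_mode="mixup")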
ultralytics/engine/trainer.py
CHANGED
@@ -538,6 +538,8 @@ class BaseTrainer:
             self.best.write_bytes(serialized_ckpt)  # save best.pt
         if (self.save_period > 0) and (self.epoch % self.save_period == 0):
             (self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt)  # save epoch, i.e. 'epoch3.pt'
+        # if self.args.close_mosaic and self.epoch == (self.epochs - self.args.close_mosaic - 1):
+        #    (self.wdir / "last_mosaic.pt").write_bytes(serialized_ckpt)  # save mosaic checkpoint
 
     def get_dataset(self):
         """
@@ -698,7 +700,12 @@ class BaseTrainer:
             resume = True
             self.args = get_cfg(ckpt_args)
             self.args.model = self.args.resume = str(last)  # reinstate model
-            for k in …
+            for k in (
+                "imgsz",
+                "batch",
+                "device",
+                "close_mosaic",
+            ):  # allow arg updates to reduce memory or update device on resume
                 if k in overrides:
                     setattr(self.args, k, overrides[k])
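A hedged example of what the expanded tuple permits (the checkpoint path is illustrative): when resuming, these four keys may now be overridden rather than being locked to the values stored in the checkpoint, e.g. to shrink memory use or move devices mid-run.

    from ultralytics import YOLO

    model = YOLO("runs/detect/train/weights/last.pt")
    model.train(resume=True, imgsz=512, batch=8, device=0, close_mosaic=5)  # these overrides now take effect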
ultralytics/nn/modules/__init__.py
CHANGED

@@ -20,6 +20,7 @@ Example:
 from .block import (
     C1,
     C2,
+    C2PSA,
     C3,
     C3TR,
     CIB,
@@ -38,7 +39,9 @@ from .block import (
     C2f,
     C2fAttn,
     C2fCIB,
+    C2fPSA,
     C3Ghost,
+    C3k2,
     C3x,
     CBFuse,
     CBLinear,
@@ -110,6 +113,10 @@ __all__ = (
     "C2",
     "C3",
     "C2f",
+    "C3k2",
+    "SCDown",
+    "C2fPSA",
+    "C2PSA",
     "C2fAttn",
     "C3x",
     "C3TR",
@@ -149,5 +156,4 @@ __all__ = (
     "C2fCIB",
     "Attention",
     "PSA",
-    "SCDown",
 )
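Since the new blocks are re-exported at the package level, a quick smoke test of the 8.3.0 public surface (the names follow directly from the __all__ changes above) is:

    from ultralytics.nn.modules import C2PSA, C2fPSA, C3k2, SCDown  # all new in 8.3.0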
ultralytics/nn/modules/block.py
CHANGED
@@ -40,6 +40,9 @@ __all__ = (
     "SPPELAN",
     "CBFuse",
     "CBLinear",
+    "C3k2",
+    "C2fPSA",
+    "C2PSA",
     "RepVGGDW",
     "CIB",
     "C2fCIB",
@@ -696,6 +699,49 @@ class CBFuse(nn.Module):
         return torch.sum(torch.stack(res + xs[-1:]), dim=0)
 
 
+class C3f(nn.Module):
+    """Faster Implementation of CSP Bottleneck with 2 convolutions."""
+
+    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
+        """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
+        expansion.
+        """
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.cv2 = Conv(c1, c_, 1, 1)
+        self.cv3 = Conv((2 + n) * c_, c2, 1)  # optional act=FReLU(c2)
+        self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
+
+    def forward(self, x):
+        """Forward pass through C2f layer."""
+        y = [self.cv2(x), self.cv1(x)]
+        y.extend(m(y[-1]) for m in self.m)
+        return self.cv3(torch.cat(y, 1))
+
+
+class C3k2(C2f):
+    """Faster Implementation of CSP Bottleneck with 2 convolutions."""
+
+    def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
+        """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        self.m = nn.ModuleList(
+            C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
+        )
+
+
+class C3k(C3):
+    """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
+
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
+        """Initializes the C3k module with specified channels, number of layers, and configurations."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        c_ = int(c2 * e)  # hidden channels
+        # self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
+        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
+
+
 class RepVGGDW(torch.nn.Module):
     """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
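A hedged shape check for the new building block (module path as exported above; sizes are arbitrary): with c3k=False, C3k2 degenerates to plain C2f-style Bottlenecks, while c3k=True swaps each repeat for a two-deep C3k.

    import torch
    from ultralytics.nn.modules.block import C3k2

    m = C3k2(64, 64, n=2, c3k=True)
    print(m(torch.randn(1, 64, 40, 40)).shape)  # torch.Size([1, 64, 40, 40]); channels and spatial dims preserved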
@@ -873,25 +919,69 @@ class Attention(nn.Module):
         return x
 
 
+class PSABlock(nn.Module):
+    """
+    PSABlock class implementing a Position-Sensitive Attention block for neural networks.
+
+    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
+    with optional shortcut connections.
+
+    Attributes:
+        attn (Attention): Multi-head attention module.
+        ffn (nn.Sequential): Feed-forward neural network module.
+        add (bool): Flag indicating whether to add shortcut connections.
+
+    Methods:
+        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.
+
+    Examples:
+        Create a PSABlock and perform a forward pass
+        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
+        >>> input_tensor = torch.randn(1, 128, 32, 32)
+        >>> output_tensor = psablock(input_tensor)
+    """
+
+    def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
+        """Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction."""
+        super().__init__()
+
+        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
+        self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
+        self.add = shortcut
+
+    def forward(self, x):
+        """Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor."""
+        x = x + self.attn(x) if self.add else self.attn(x)
+        x = x + self.ffn(x) if self.add else self.ffn(x)
+        return x
+
+
 class PSA(nn.Module):
     """
-    Position-…
+    PSA class for implementing Position-Sensitive Attention in neural networks.
 
-    …
-        c2 (int): Number of output channels.
-        e (float): Expansion factor for the intermediate channels. Default is 0.5.
+    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
+    input tensors, enhancing feature extraction and processing capabilities.
 
     Attributes:
-        c (int): Number of …
+        c (int): Number of hidden channels after applying the initial convolution.
         cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
         cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
-        attn (Attention): Attention module for …
-        ffn (nn.Sequential): Feed-forward network …
+        attn (Attention): Attention module for position-sensitive attention.
+        ffn (nn.Sequential): Feed-forward network for further processing.
+
+    Methods:
+        forward: Applies position-sensitive attention and feed-forward network to the input tensor.
+
+    Examples:
+        Create a PSA module and apply it to an input tensor
+        >>> psa = PSA(c1=128, c2=128, e=0.5)
+        >>> input_tensor = torch.randn(1, 128, 64, 64)
+        >>> output_tensor = psa.forward(input_tensor)
     """
 
     def __init__(self, c1, c2, e=0.5):
-        """Initializes …
+        """Initializes the PSA module with input/output channels and attention mechanism for feature extraction."""
         super().__init__()
         assert c1 == c2
         self.c = int(c1 * e)
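A small hedged sketch of the residual wiring: PSABlock preserves the input shape, and with shortcut=True both the attention and FFN sub-layers add onto the input stream (internally x + attn(x), then x + ffn(x)).

    import torch
    from ultralytics.nn.modules.block import PSABlock

    blk = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
    print(blk(torch.randn(2, 128, 32, 32)).shape)  # torch.Size([2, 128, 32, 32])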
@@ -902,46 +992,117 @@ class PSA(nn.Module):
         self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
 
     def forward(self, x):
-        """
-        Forward pass of the PSA module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            (torch.Tensor): Output tensor.
-        """
+        """Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor."""
         a, b = self.cv1(x).split((self.c, self.c), dim=1)
         b = b + self.attn(b)
         b = b + self.ffn(b)
         return self.cv2(torch.cat((a, b), 1))
 
 
+class C2PSA(nn.Module):
+    """
+    C2PSA module with attention mechanism for enhanced feature extraction and processing.
+
+    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
+    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.
+
+    Attributes:
+        c (int): Number of hidden channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.
+
+    Methods:
+        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.
+
+    Notes:
+        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
+
+    Examples:
+        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
+        >>> input_tensor = torch.randn(1, 256, 64, 64)
+        >>> output_tensor = c2psa(input_tensor)
+    """
+
+    def __init__(self, c1, c2, n=1, e=0.5):
+        """Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio."""
+        super().__init__()
+        assert c1 == c2
+        self.c = int(c1 * e)
+        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+        self.cv2 = Conv(2 * self.c, c1, 1)
+
+        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
+
+    def forward(self, x):
+        """Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor."""
+        a, b = self.cv1(x).split((self.c, self.c), dim=1)
+        b = self.m(b)
+        return self.cv2(torch.cat((a, b), 1))
+
+
+class C2fPSA(C2f):
+    """
+    C2fPSA module with enhanced feature extraction using PSA blocks.
+
+    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.
+
+    Attributes:
+        c (int): Number of hidden channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        m (nn.ModuleList): List of PSA blocks for feature extraction.
+
+    Methods:
+        forward: Performs a forward pass through the C2fPSA module.
+        forward_split: Performs a forward pass using split() instead of chunk().
+
+    Examples:
+        >>> import torch
+        >>> from ultralytics.models.common import C2fPSA
+        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
+        >>> x = torch.randn(1, 64, 128, 128)
+        >>> output = model(x)
+        >>> print(output.shape)
+    """
+
+    def __init__(self, c1, c2, n=1, e=0.5):
+        """Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction."""
+        assert c1 == c2
+        super().__init__(c1, c2, n=n, e=e)
+        self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
+
+
 class SCDown(nn.Module):
-    """
+    """
+    SCDown module for downsampling with separable convolutions.
 
-    …
-    Spatial Channel Downsample (SCDown) module.
+    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
+    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.
 
-    …
+    Attributes:
+        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
+        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.
+
+    Methods:
+        forward: Applies the SCDown module to the input tensor.
+
+    Examples:
+        >>> import torch
+        >>> from ultralytics import SCDown
+        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
+        >>> x = torch.randn(1, 64, 128, 128)
+        >>> y = model(x)
+        >>> print(y.shape)
+        torch.Size([1, 128, 64, 64])
+    """
+
+    def __init__(self, c1, c2, k, s):
+        """Initializes the SCDown module with specified input/output channels, kernel size, and stride."""
         super().__init__()
         self.cv1 = Conv(c1, c2, 1, 1)
         self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
 
     def forward(self, x):
-        """
-        Forward pass of the SCDown module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            (torch.Tensor): Output tensor after applying the SCDown module.
-        """
+        """Applies convolution and downsampling to the input tensor in the SCDown module."""
         return self.cv2(self.cv1(x))
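One hedged end-to-end check tying the new blocks together (shapes arbitrary): C2PSA and C2fPSA size their attention heads as hidden_channels // 64, so c1 * e should stay a multiple of 64, and SCDown halves the spatial dims via a pointwise 1x1 followed by a strided depthwise 3x3.

    import torch
    from ultralytics.nn.modules.block import C2PSA, SCDown

    c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)  # 128 hidden channels -> 2 heads
    down = SCDown(c1=256, c2=512, k=3, s=2)    # pointwise 256->512, then 3x3/s2 depthwise
    x = torch.randn(1, 256, 64, 64)
    print(down(c2psa(x)).shape)  # torch.Size([1, 512, 32, 32])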
ultralytics/nn/modules/conv.py
CHANGED
@@ -209,7 +209,8 @@ class RepConv(nn.Module):
         kernelid, biasid = self._fuse_bn_tensor(self.bn)
         return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
 
-    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+    @staticmethod
+    def _pad_1x1_to_3x3_tensor(kernel1x1):
         """Pads a 1x1 tensor to a 3x3 tensor."""
         if kernel1x1 is None:
             return 0
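What the now-static helper computes, in isolation (a hedged standalone sketch; RepConv uses torch.nn.functional.pad the same way): the 1x1 branch's kernel is zero-padded into the center of a 3x3 kernel so that the 1x1 and 3x3 branches can be fused into a single 3x3 convolution.

    import torch
    import torch.nn.functional as F

    k1 = torch.randn(8, 8, 1, 1)  # (out_ch, in_ch, 1, 1): kernel of the 1x1 branch
    k3 = F.pad(k1, [1, 1, 1, 1])  # (8, 8, 3, 3): the original value sits at k3[:, :, 1, 1]
    print(k3.shape)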
ultralytics/nn/modules/head.py
CHANGED
@@ -11,7 +11,7 @@ from torch.nn.init import constant_, xavier_uniform_
 from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
 
 from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
-from .conv import Conv
+from .conv import Conv, DWConv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
 
@@ -41,7 +41,14 @@ class Detect(nn.Module):
         self.cv2 = nn.ModuleList(
             nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
         )
-        self.cv3 = nn.ModuleList(…)
+        self.cv3 = nn.ModuleList(
+            nn.Sequential(
+                nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
+                nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
+                nn.Conv2d(c3, self.nc, 1),
+            )
+            for x in ch
+        )
         self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
 
         if self.end2end:
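A hedged back-of-envelope on why the classification branch (cv3) was refactored: replacing full 3x3 Convs with depthwise 3x3 plus pointwise 1x1 pairs cuts the branch's parameter count by roughly an order of magnitude. The figures below assume one level with 256 channels, 80 classes, and (an assumption, since the old line is truncated in this view) an 8.2-style branch of Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, nc, 1).

    import torch.nn as nn
    from ultralytics.nn.modules.conv import Conv, DWConv

    x = c3 = 256
    nc = 80
    new_branch = nn.Sequential(
        nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
        nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
        nn.Conv2d(c3, nc, 1),
    )
    print(sum(p.numel() for p in new_branch.parameters()))  # ~158k, vs ~1.2M for two full 3x3 Convs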