ultralytics 8.2.37__py3-none-any.whl → 8.2.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ultralytics might be problematic.
- tests/test_python.py +9 -0
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/models/v10/yolov10b.yaml +42 -0
- ultralytics/cfg/models/v10/yolov10l.yaml +42 -0
- ultralytics/cfg/models/v10/yolov10m.yaml +42 -0
- ultralytics/cfg/models/v10/yolov10n.yaml +42 -0
- ultralytics/cfg/models/v10/yolov10s.yaml +42 -0
- ultralytics/cfg/models/v10/yolov10x.yaml +42 -0
- ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
- ultralytics/data/augment.py +13 -16
- ultralytics/data/converter.py +10 -11
- ultralytics/data/split_dota.py +4 -4
- ultralytics/engine/exporter.py +3 -2
- ultralytics/engine/model.py +0 -1
- ultralytics/models/sam/modules/tiny_encoder.py +6 -7
- ultralytics/nn/modules/__init__.py +14 -1
- ultralytics/nn/modules/block.py +256 -1
- ultralytics/nn/modules/head.py +114 -4
- ultralytics/nn/tasks.py +40 -18
- ultralytics/solutions/__init__.py +1 -0
- ultralytics/utils/__init__.py +1 -1
- ultralytics/utils/benchmarks.py +5 -0
- ultralytics/utils/downloads.py +1 -0
- ultralytics/utils/loss.py +20 -2
- ultralytics/utils/metrics.py +2 -1
- ultralytics/utils/ops.py +3 -0
- {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/METADATA +6 -6
- {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/RECORD +32 -26
- {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/LICENSE +0 -0
- {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/WHEEL +0 -0
- {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/top_level.txt +0 -0
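The headline change in this release is YOLOv10 support: new v10 model YAMLs, new building blocks (RepVGGDW, CIB, C2fCIB, Attention, PSA, SCDown), an end-to-end v10Detect head, and an E2EDetectLoss. A minimal usage sketch, assuming ultralytics 8.2.39 is installed and that the standard YOLO entry point accepts the new configs and weights (the image URL is only an illustrative placeholder):

```python
# Minimal sketch (assumes ultralytics >= 8.2.39; model/config names taken from this diff).
from ultralytics import YOLO

model = YOLO("yolov10n.pt")      # released YOLOv10-nano weights, now listed in GITHUB_ASSETS_NAMES
# model = YOLO("yolov10n.yaml")  # or build an untrained model from the newly added config
results = model.predict("https://ultralytics.com/images/bus.jpg", imgsz=640)
print(len(results[0].boxes))     # number of detections for the first image
```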
ultralytics/nn/modules/block.py
CHANGED
@@ -5,6 +5,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ultralytics.utils.torch_utils import fuse_conv_and_bn
+
 from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
 from .transformer import TransformerBlock
 
@@ -38,7 +40,12 @@ __all__ = (
     "SPPELAN",
     "CBFuse",
     "CBLinear",
-    "Silence",
+    "RepVGGDW",
+    "CIB",
+    "C2fCIB",
+    "Attention",
+    "PSA",
+    "SCDown",
 )
 
 
@@ -699,3 +706,251 @@ class CBFuse(nn.Module):
         target_size = xs[-1].shape[2:]
         res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
         return torch.sum(torch.stack(res + xs[-1:]), dim=0)
+
+
+class RepVGGDW(torch.nn.Module):
+    """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
+
+    def __init__(self, ed) -> None:
+        super().__init__()
+        self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
+        self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
+        self.dim = ed
+        self.act = nn.SiLU()
+
+    def forward(self, x):
+        """
+        Performs a forward pass of the RepVGGDW block.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
+        """
+        return self.act(self.conv(x) + self.conv1(x))
+
+    def forward_fuse(self, x):
+        """
+        Performs a forward pass of the RepVGGDW block without fusing the convolutions.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
+        """
+        return self.act(self.conv(x))
+
+    @torch.no_grad()
+    def fuse(self):
+        """
+        Fuses the convolutional layers in the RepVGGDW block.
+
+        This method fuses the convolutional layers and updates the weights and biases accordingly.
+        """
+        conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
+        conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)
+
+        conv_w = conv.weight
+        conv_b = conv.bias
+        conv1_w = conv1.weight
+        conv1_b = conv1.bias
+
+        conv1_w = torch.nn.functional.pad(conv1_w, [2, 2, 2, 2])
+
+        final_conv_w = conv_w + conv1_w
+        final_conv_b = conv_b + conv1_b
+
+        conv.weight.data.copy_(final_conv_w)
+        conv.bias.data.copy_(final_conv_b)
+
+        self.conv = conv
+        del self.conv1
+
+
+class CIB(nn.Module):
+    """
+    Conditional Identity Block (CIB) module.
+
+    Args:
+        c1 (int): Number of input channels.
+        c2 (int): Number of output channels.
+        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
+        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
+        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
+    """
+
+    def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
+        """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer."""
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = nn.Sequential(
+            Conv(c1, c1, 3, g=c1),
+            Conv(c1, 2 * c_, 1),
+            RepVGGDW(2 * c_) if lk else Conv(2 * c_, 2 * c_, 3, g=2 * c_),
+            Conv(2 * c_, c2, 1),
+            Conv(c2, c2, 3, g=c2),
+        )
+
+        self.add = shortcut and c1 == c2
+
+    def forward(self, x):
+        """
+        Forward pass of the CIB module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor.
+        """
+        return x + self.cv1(x) if self.add else self.cv1(x)
+
+
+class C2fCIB(C2f):
+    """
+    C2fCIB class represents a convolutional block with C2f and CIB modules.
+
+    Args:
+        c1 (int): Number of input channels.
+        c2 (int): Number of output channels.
+        n (int, optional): Number of CIB modules to stack. Defaults to 1.
+        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
+        lk (bool, optional): Whether to use local key connection. Defaults to False.
+        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
+        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
+    """
+
+    def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
+        """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
+
+
+class Attention(nn.Module):
+    """
+    Attention module that performs self-attention on the input tensor.
+
+    Args:
+        dim (int): The input tensor dimension.
+        num_heads (int): The number of attention heads.
+        attn_ratio (float): The ratio of the attention key dimension to the head dimension.
+
+    Attributes:
+        num_heads (int): The number of attention heads.
+        head_dim (int): The dimension of each attention head.
+        key_dim (int): The dimension of the attention key.
+        scale (float): The scaling factor for the attention scores.
+        qkv (Conv): Convolutional layer for computing the query, key, and value.
+        proj (Conv): Convolutional layer for projecting the attended values.
+        pe (Conv): Convolutional layer for positional encoding.
+    """
+
+    def __init__(self, dim, num_heads=8, attn_ratio=0.5):
+        """Initializes multi-head attention module with query, key, and value convolutions and positional encoding."""
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.key_dim = int(self.head_dim * attn_ratio)
+        self.scale = self.key_dim**-0.5
+        nh_kd = nh_kd = self.key_dim * num_heads
+        h = dim + nh_kd * 2
+        self.qkv = Conv(dim, h, 1, act=False)
+        self.proj = Conv(dim, dim, 1, act=False)
+        self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
+
+    def forward(self, x):
+        """
+        Forward pass of the Attention module.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            (torch.Tensor): The output tensor after self-attention.
+        """
+        B, C, H, W = x.shape
+        N = H * W
+        qkv = self.qkv(x)
+        q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
+            [self.key_dim, self.key_dim, self.head_dim], dim=2
+        )
+
+        attn = (q.transpose(-2, -1) @ k) * self.scale
+        attn = attn.softmax(dim=-1)
+        x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
+        x = self.proj(x)
+        return x
+
+
+class PSA(nn.Module):
+    """
+    Position-wise Spatial Attention module.
+
+    Args:
+        c1 (int): Number of input channels.
+        c2 (int): Number of output channels.
+        e (float): Expansion factor for the intermediate channels. Default is 0.5.
+
+    Attributes:
+        c (int): Number of intermediate channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        attn (Attention): Attention module for spatial attention.
+        ffn (nn.Sequential): Feed-forward network module.
+    """
+
+    def __init__(self, c1, c2, e=0.5):
+        """Initializes convolution layers, attention module, and feed-forward network with channel reduction."""
+        super().__init__()
+        assert c1 == c2
+        self.c = int(c1 * e)
+        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+        self.cv2 = Conv(2 * self.c, c1, 1)
+
+        self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
+        self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
+
+    def forward(self, x):
+        """
+        Forward pass of the PSA module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor.
+        """
+        a, b = self.cv1(x).split((self.c, self.c), dim=1)
+        b = b + self.attn(b)
+        b = b + self.ffn(b)
+        return self.cv2(torch.cat((a, b), 1))
+
+
+class SCDown(nn.Module):
+    def __init__(self, c1, c2, k, s):
+        """
+        Spatial Channel Downsample (SCDown) module.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (int): Kernel size for the convolutional layer.
+            s (int): Stride for the convolutional layer.
+        """
+        super().__init__()
+        self.cv1 = Conv(c1, c2, 1, 1)
+        self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
+
+    def forward(self, x):
+        """
+        Forward pass of the SCDown module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the SCDown module.
+        """
+        return self.cv2(self.cv1(x))
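The new RepVGGDW.fuse() collapses the parallel 3x3 depthwise branch into the 7x7 depthwise branch: after folding each Conv's BatchNorm with fuse_conv_and_bn, it zero-pads the 3x3 kernel to 7x7 and sums weights and biases. A standalone sketch of that re-parameterization in plain PyTorch (no ultralytics Conv wrapper, no BatchNorm), showing why the padded sum is numerically equivalent:

```python
# Sketch of the RepVGGDW re-parameterization: 7x7 DW conv + 3x3 DW conv == one 7x7 DW conv
# whose kernel is the 7x7 kernel plus the 3x3 kernel zero-padded via F.pad(w, [2, 2, 2, 2]).
import torch
import torch.nn.functional as F

ed = 8  # depthwise: groups == channels
conv7 = torch.nn.Conv2d(ed, ed, 7, 1, 3, groups=ed)
conv3 = torch.nn.Conv2d(ed, ed, 3, 1, 1, groups=ed)

x = torch.randn(1, ed, 16, 16)
y_sum = conv7(x) + conv3(x)  # what RepVGGDW.forward computes (before the SiLU activation)

fused = torch.nn.Conv2d(ed, ed, 7, 1, 3, groups=ed)
fused.weight.data = conv7.weight.data + F.pad(conv3.weight.data, [2, 2, 2, 2])
fused.bias.data = conv7.bias.data + conv3.bias.data

print(torch.allclose(y_sum, fused(x), atol=1e-6))  # True
```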
ultralytics/nn/modules/head.py
CHANGED
@@ -1,6 +1,7 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 """Model head modules."""
 
+import copy
 import math
 
 import torch
@@ -14,7 +15,7 @@ from .conv import Conv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
 
-__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"
+__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder", "v10Detect"
 
 
 class Detect(nn.Module):
@@ -22,6 +23,8 @@ class Detect(nn.Module):
 
     dynamic = False  # force grid reconstruction
     export = False  # export mode
+    end2end = False  # end2end
+    max_det = 300  # max_det
     shape = None
     anchors = torch.empty(0)  # init
     strides = torch.empty(0)  # init
@@ -41,13 +44,48 @@ class Detect(nn.Module):
         self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
         self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
 
+        if self.end2end:
+            self.one2one_cv2 = copy.deepcopy(self.cv2)
+            self.one2one_cv3 = copy.deepcopy(self.cv3)
+
     def forward(self, x):
         """Concatenates and returns predicted bounding boxes and class probabilities."""
+        if self.end2end:
+            return self.forward_end2end(x)
+
         for i in range(self.nl):
             x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
         if self.training:  # Training path
             return x
+        y = self._inference(x)
+        return y if self.export else (y, x)
+
+    def forward_end2end(self, x):
+        """
+        Performs forward pass of the v10Detect module.
 
+        Args:
+            x (tensor): Input tensor.
+
+        Returns:
+            (dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
+                If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
+        """
+        x_detach = [xi.detach() for xi in x]
+        one2one = [
+            torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
+        ]
+        for i in range(self.nl):
+            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
+        if self.training:  # Training path
+            return {"one2many": x, "one2one": one2one}
+
+        y = self._inference(one2one)
+        y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
+        return y if self.export else (y, {"one2many": x, "one2one": one2one})
+
+    def _inference(self, x):
+        """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
         # Inference path
         shape = x[0].shape  # BCHW
         x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
@@ -72,8 +110,7 @@ class Detect(nn.Module):
         else:
             dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
 
-        y = torch.cat((dbox, cls.sigmoid()), 1)
-        return y if self.export else (y, x)
+        return torch.cat((dbox, cls.sigmoid()), 1)
 
     def bias_init(self):
         """Initialize Detect() biases, WARNING: requires stride availability."""
@@ -83,10 +120,47 @@ class Detect(nn.Module):
         for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
             a[-1].bias.data[:] = 1.0  # box
             b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
+        if self.end2end:
+            for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride):  # from
+                a[-1].bias.data[:] = 1.0  # box
+                b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
 
     def decode_bboxes(self, bboxes, anchors):
         """Decode bounding boxes."""
-        return dist2bbox(bboxes, anchors, xywh=True, dim=1)
+        return dist2bbox(bboxes, anchors, xywh=not self.end2end, dim=1)
+
+    @staticmethod
+    def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
+        """
+        Post-processes the predictions obtained from a YOLOv10 model.
+
+        Args:
+            preds (torch.Tensor): The predictions obtained from the model. It should have a shape of (batch_size, num_boxes, 4 + num_classes).
+            max_det (int): The maximum number of detections to keep.
+            nc (int, optional): The number of classes. Defaults to 80.
+
+        Returns:
+            (torch.Tensor): The post-processed predictions with shape (batch_size, max_det, 6),
+                including bounding boxes, scores and cls.
+        """
+        assert 4 + nc == preds.shape[-1]
+        boxes, scores = preds.split([4, nc], dim=-1)
+        max_scores = scores.amax(dim=-1)
+        max_scores, index = torch.topk(max_scores, min(max_det, max_scores.shape[1]), axis=-1)
+        index = index.unsqueeze(-1)
+        boxes = torch.gather(boxes, dim=1, index=index.repeat(1, 1, boxes.shape[-1]))
+        scores = torch.gather(scores, dim=1, index=index.repeat(1, 1, scores.shape[-1]))
+
+        # NOTE: simplify but result slightly lower mAP
+        # scores, labels = scores.max(dim=-1)
+        # return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1)
+
+        scores, index = torch.topk(scores.flatten(1), max_det, axis=-1)
+        labels = index % nc
+        index = index // nc
+        boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+
+        return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1).to(boxes.dtype)], dim=-1)
 
 
 class Segment(Detect):
@@ -487,3 +561,39 @@ class RTDETRDecoder(nn.Module):
         xavier_uniform_(self.query_pos_head.layers[1].weight)
         for layer in self.input_proj:
             xavier_uniform_(layer[0].weight)
+
+
+class v10Detect(Detect):
+    """
+    v10 Detection head from https://arxiv.org/pdf/2405.14458
+
+    Args:
+        nc (int): Number of classes.
+        ch (tuple): Tuple of channel sizes.
+
+    Attributes:
+        max_det (int): Maximum number of detections.
+
+    Methods:
+        __init__(self, nc=80, ch=()): Initializes the v10Detect object.
+        forward(self, x): Performs forward pass of the v10Detect module.
+        bias_init(self): Initializes biases of the Detect module.
+    """
+
+    end2end = True
+
+    def __init__(self, nc=80, ch=()):
+        """Initializes the v10Detect object with the specified number of classes and input channels."""
+        super().__init__(nc, ch)
+        c3 = max(ch[0], min(self.nc, 100))  # channels
+        # Light cls head
+        self.cv3 = nn.ModuleList(
+            nn.Sequential(
+                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),
+                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
+                nn.Conv2d(c3, self.nc, 1),
+            )
+            for x in ch
+        )
+        self.one2one_cv3 = copy.deepcopy(self.cv3)
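The added Detect.postprocess is the NMS-free selection step used on the end-to-end (one-to-one) branch: it keeps the top max_det candidates by class score and packs each detection as 4 box coordinates, a confidence score, and a class index. A quick shape check on random data, assuming this release is installed (real inputs come from v10Detect, already decoded to xyxy by dist2bbox with xywh=False):

```python
# Shape contract of the new NMS-free postprocess (random data, illustration only).
import torch
from ultralytics.nn.modules.head import Detect

nc, max_det = 80, 300
preds = torch.rand(2, 8400, 4 + nc)           # (batch, candidates, 4 box coords + class scores)
out = Detect.postprocess(preds, max_det, nc)  # static method, no NMS involved
print(out.shape)                              # torch.Size([2, 300, 6]): box(4), score, class
```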
ultralytics/nn/tasks.py
CHANGED
@@ -15,6 +15,7 @@ from ultralytics.nn.modules import (
     C3TR,
     ELAN1,
     OBB,
+    PSA,
     SPP,
     SPPELAN,
     SPPF,
@@ -24,6 +25,7 @@ from ultralytics.nn.modules import (
     BottleneckCSP,
     C2f,
     C2fAttn,
+    C2fCIB,
     C3Ghost,
     C3x,
     CBFuse,
@@ -46,14 +48,24 @@ from ultralytics.nn.modules import (
     RepC3,
     RepConv,
     RepNCSPELAN4,
+    RepVGGDW,
     ResNetLayer,
     RTDETRDecoder,
+    SCDown,
     Segment,
     WorldDetect,
+    v10Detect,
 )
 from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
 from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
-from ultralytics.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8OBBLoss, v8PoseLoss, v8SegmentationLoss
+from ultralytics.utils.loss import (
+    E2EDetectLoss,
+    v8ClassificationLoss,
+    v8DetectionLoss,
+    v8OBBLoss,
+    v8PoseLoss,
+    v8SegmentationLoss,
+)
 from ultralytics.utils.plotting import feature_visualization
 from ultralytics.utils.torch_utils import (
     fuse_conv_and_bn,
@@ -192,6 +204,9 @@ class BaseModel(nn.Module):
                 if isinstance(m, RepConv):
                     m.fuse_convs()
                     m.forward = m.forward_fuse  # update forward
+                if isinstance(m, RepVGGDW):
+                    m.fuse()
+                    m.forward = m.forward_fuse
             self.info(verbose=verbose)
 
         return self
@@ -294,6 +309,7 @@ class DetectionModel(BaseModel):
         self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
         self.names = {i: f"{i}" for i in range(self.yaml["nc"])}  # default names dict
         self.inplace = self.yaml.get("inplace", True)
+        self.end2end = getattr(self.model[-1], "end2end", False)
 
         # Build strides
         m = self.model[-1]  # Detect()
@@ -303,6 +319,8 @@ class DetectionModel(BaseModel):
 
             def _forward(x):
                 """Performs a forward pass through the model, handling different Detect subclass types accordingly."""
+                if self.end2end:
+                    return self.forward(x)["one2many"]
                 return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
 
             m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
@@ -355,7 +373,7 @@ class DetectionModel(BaseModel):
 
     def init_criterion(self):
         """Initialize the loss criterion for the DetectionModel."""
-        return v8DetectionLoss(self)
+        return E2EDetectLoss(self) if self.end2end else v8DetectionLoss(self)
 
 
 class OBBModel(DetectionModel):
@@ -675,7 +693,7 @@ class Ensemble(nn.ModuleList):
 
 
 @contextlib.contextmanager
-def temporary_modules(modules={}, attributes={}):
+def temporary_modules(modules=None, attributes=None):
     """
     Context manager for temporarily adding or modifying modules in Python's module cache (`sys.modules`).
 
@@ -689,8 +707,8 @@ def temporary_modules(modules={}, attributes={}):
 
     Example:
         ```python
-        with temporary_modules({'old.module
-            import old.module
+        with temporary_modules({'old.module': 'new.module'}, {'old.module.attribute': 'new.module.attribute'}):
+            import old.module  # this will now import new.module
            from old.module import attribute  # this will now import new.module.attribute
         ```
 
@@ -700,23 +718,23 @@ def temporary_modules(modules={}, attributes={}):
     applications or libraries. Use this function with caution.
     """
-    import importlib
+    if modules is None:
+        modules = {}
+    if attributes is None:
+        attributes = {}
     import sys
+    from importlib import import_module
 
     try:
         # Set attributes in sys.modules under their old name
         for old, new in attributes.items():
             old_module, old_attr = old.rsplit(".", 1)
             new_module, new_attr = new.rsplit(".", 1)
-            setattr(
-                importlib.import_module(old_module),
-                old_attr,
-                getattr(importlib.import_module(new_module), new_attr),
-            )
+            setattr(import_module(old_module), old_attr, getattr(import_module(new_module), new_attr))
 
         # Set modules in sys.modules under their old name
         for old, new in modules.items():
-            sys.modules[old] = importlib.import_module(new)
+            sys.modules[old] = import_module(new)
 
         yield
     finally:
@@ -750,9 +768,10 @@ def torch_safe_load(weight):
             "ultralytics.yolo.data": "ultralytics.data",
         },
         attributes={
-            "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",
+            "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",  # YOLOv9e
+            "ultralytics.nn.tasks.YOLOv10DetectionModel": "ultralytics.nn.tasks.DetectionModel",  # YOLOv10
         },
-    ):
+    ):
         ckpt = torch.load(file, map_location="cpu")
 
     except ModuleNotFoundError as e:  # e.name is missing module name
@@ -911,6 +930,9 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             DWConvTranspose2d,
             C3x,
             RepC3,
+            PSA,
+            SCDown,
+            C2fCIB,
         }:
             c1, c2 = ch[f], args[0]
             if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
@@ -922,7 +944,7 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             )  # num heads
 
             args = [c1, c2, *args[1:]]
-            if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3}:
+            if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3, C2fCIB}:
                 args.insert(2, n)  # number of repeats
                 n = 1
         elif m is AIFI:
@@ -939,7 +961,7 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             args = [ch[f]]
         elif m is Concat:
             c2 = sum(ch[x] for x in f)
-        elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn}:
+        elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn, v10Detect}:
             args.append([ch[x] for x in f])
             if m is Segment:
                 args[2] = make_divisible(min(args[2], max_channels) * width, 8)
@@ -1024,7 +1046,7 @@ def guess_model_task(model):
         m = cfg["head"][-1][-2].lower()  # output module name
         if m in {"classify", "classifier", "cls", "fc"}:
             return "classify"
-        if m == "detect":
+        if "detect" in m:
            return "detect"
        if m == "segment":
            return "segment"
@@ -1056,7 +1078,7 @@ def guess_model_task(model):
                 return "pose"
             elif isinstance(m, OBB):
                 return "obb"
-            elif isinstance(m, (Detect, WorldDetect)):
+            elif isinstance(m, (Detect, WorldDetect, v10Detect)):
                 return "detect"
 
     # Guess from model filename
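Alongside the YOLOv10 wiring, temporary_modules() drops its mutable default arguments (modules={} and attributes={} become None with an in-function fallback). A toy example, independent of ultralytics, of the Python pitfall this avoids:

```python
# Mutable default arguments are created once at function definition and shared across calls.
def bad(key, cache={}):        # one dict object reused by every call
    cache[key] = True
    return cache

def good(key, cache=None):     # fresh dict per call unless the caller supplies one
    if cache is None:
        cache = {}
    cache[key] = True
    return cache

print(bad("a"), bad("b"))      # {'a': True, 'b': True} {'a': True, 'b': True} -> state leaks
print(good("a"), good("b"))    # {'a': True} {'b': True}
```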
ultralytics/utils/__init__.py
CHANGED
@@ -1070,7 +1070,7 @@ TESTS_RUNNING = is_pytest_running() or is_github_action_running()
 set_sentry()
 
 # Apply monkey patches
-from .patches import imread, imshow, imwrite, torch_save
+from ultralytics.utils.patches import imread, imshow, imwrite, torch_save
 
 torch.save = torch_save
 if WINDOWS:
ultralytics/utils/benchmarks.py
CHANGED
@@ -81,6 +81,7 @@ def benchmark(
     device = select_device(device, verbose=False)
     if isinstance(model, (str, Path)):
         model = YOLO(model)
+    is_end2end = getattr(model.model.model[-1], "end2end", False)
 
     y = []
     t0 = time.time()
@@ -96,14 +97,18 @@ def benchmark(
                 assert MACOS or LINUX, "CoreML and TF.js export only supported on macOS and Linux"
                 assert not IS_RASPBERRYPI, "CoreML and TF.js export not supported on Raspberry Pi"
                 assert not IS_JETSON, "CoreML and TF.js export not supported on NVIDIA Jetson"
+                assert not is_end2end, "End-to-end models not supported by CoreML and TF.js yet"
             if i in {3, 5}:  # CoreML and OpenVINO
                 assert not IS_PYTHON_3_12, "CoreML and OpenVINO not supported on Python 3.12"
             if i in {6, 7, 8, 9, 10}:  # All TF formats
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
+                assert not is_end2end, "End-to-end models not supported by onnx2tf yet"
             if i in {11}:  # Paddle
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
+                assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
             if i in {12}:  # NCNN
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 NCNN exports not supported yet"
+                assert not is_end2end, "End-to-end models not supported by NCNN yet"
             if "cpu" in device.type:
                 assert cpu, "inference not supported on CPU"
             if "cuda" in device.type:
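The new guards rely on duck-typed feature detection: is_end2end is read off the model's last module with getattr, so only heads that define end2end = True (v10Detect) trip the skips for CoreML/TF.js, onnx2tf, PaddlePaddle, and NCNN. A tiny illustration of the lookup, using a stand-in module rather than a real detection head:

```python
# getattr with a default keeps the check safe on heads that predate the attribute.
import torch.nn as nn

head = nn.Identity()                    # stand-in for a conventional (non-end2end) head
print(getattr(head, "end2end", False))  # False -> no export formats are skipped

head.end2end = True                     # stand-in for a v10Detect-style end-to-end head
print(getattr(head, "end2end", False))  # True -> the guarded export formats are skipped
```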
ultralytics/utils/downloads.py
CHANGED
@@ -23,6 +23,7 @@ GITHUB_ASSETS_NAMES = (
     + [f"yolov8{k}-world.pt" for k in "smlx"]
     + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
     + [f"yolov9{k}.pt" for k in "ce"]
+    + [f"yolov10{k}.pt" for k in "nsmblx"]
     + [f"yolo_nas_{k}.pt" for k in "sml"]
     + [f"sam_{k}.pt" for k in "bl"]
     + [f"FastSAM-{k}.pt" for k in "sx"]