ultralytics 8.2.37__py3-none-any.whl → 8.2.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ultralytics might be problematic.

Files changed (32)
  1. tests/test_python.py +9 -0
  2. ultralytics/__init__.py +1 -1
  3. ultralytics/cfg/models/v10/yolov10b.yaml +42 -0
  4. ultralytics/cfg/models/v10/yolov10l.yaml +42 -0
  5. ultralytics/cfg/models/v10/yolov10m.yaml +42 -0
  6. ultralytics/cfg/models/v10/yolov10n.yaml +42 -0
  7. ultralytics/cfg/models/v10/yolov10s.yaml +42 -0
  8. ultralytics/cfg/models/v10/yolov10x.yaml +42 -0
  9. ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
  10. ultralytics/data/augment.py +13 -16
  11. ultralytics/data/converter.py +10 -11
  12. ultralytics/data/split_dota.py +4 -4
  13. ultralytics/engine/exporter.py +3 -2
  14. ultralytics/engine/model.py +0 -1
  15. ultralytics/models/sam/modules/tiny_encoder.py +6 -7
  16. ultralytics/nn/modules/__init__.py +14 -1
  17. ultralytics/nn/modules/block.py +256 -1
  18. ultralytics/nn/modules/head.py +114 -4
  19. ultralytics/nn/tasks.py +40 -18
  20. ultralytics/solutions/__init__.py +1 -0
  21. ultralytics/utils/__init__.py +1 -1
  22. ultralytics/utils/benchmarks.py +5 -0
  23. ultralytics/utils/downloads.py +1 -0
  24. ultralytics/utils/loss.py +20 -2
  25. ultralytics/utils/metrics.py +2 -1
  26. ultralytics/utils/ops.py +3 -0
  27. {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/METADATA +6 -6
  28. {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/RECORD +32 -26
  29. {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/LICENSE +0 -0
  30. {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/WHEEL +0 -0
  31. {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/entry_points.txt +0 -0
  32. {ultralytics-8.2.37.dist-info → ultralytics-8.2.39.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/block.py CHANGED
@@ -5,6 +5,8 @@ import torch
  import torch.nn as nn
  import torch.nn.functional as F

+ from ultralytics.utils.torch_utils import fuse_conv_and_bn
+
  from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
  from .transformer import TransformerBlock

@@ -38,7 +40,12 @@ __all__ = (
      "SPPELAN",
      "CBFuse",
      "CBLinear",
-     "Silence",
+     "RepVGGDW",
+     "CIB",
+     "C2fCIB",
+     "Attention",
+     "PSA",
+     "SCDown",
  )

@@ -699,3 +706,251 @@ class CBFuse(nn.Module):
          target_size = xs[-1].shape[2:]
          res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
          return torch.sum(torch.stack(res + xs[-1:]), dim=0)
+
+
+ class RepVGGDW(torch.nn.Module):
+     """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
+
+     def __init__(self, ed) -> None:
+         super().__init__()
+         self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
+         self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
+         self.dim = ed
+         self.act = nn.SiLU()
+
+     def forward(self, x):
+         """
+         Performs a forward pass of the RepVGGDW block.
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             (torch.Tensor): Output tensor after applying the depth wise separable convolution.
+         """
+         return self.act(self.conv(x) + self.conv1(x))
+
+     def forward_fuse(self, x):
+         """
+         Performs a forward pass of the RepVGGDW block without fusing the convolutions.
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             (torch.Tensor): Output tensor after applying the depth wise separable convolution.
+         """
+         return self.act(self.conv(x))
+
+     @torch.no_grad()
+     def fuse(self):
+         """
+         Fuses the convolutional layers in the RepVGGDW block.
+
+         This method fuses the convolutional layers and updates the weights and biases accordingly.
+         """
+         conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
+         conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)
+
+         conv_w = conv.weight
+         conv_b = conv.bias
+         conv1_w = conv1.weight
+         conv1_b = conv1.bias
+
+         conv1_w = torch.nn.functional.pad(conv1_w, [2, 2, 2, 2])
+
+         final_conv_w = conv_w + conv1_w
+         final_conv_b = conv_b + conv1_b
+
+         conv.weight.data.copy_(final_conv_w)
+         conv.bias.data.copy_(final_conv_b)
+
+         self.conv = conv
+         del self.conv1
+
+
+ class CIB(nn.Module):
+     """
+     Conditional Identity Block (CIB) module.
+
+     Args:
+         c1 (int): Number of input channels.
+         c2 (int): Number of output channels.
+         shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
+         e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
+         lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
+     """
+
+     def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
+         """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer."""
+         super().__init__()
+         c_ = int(c2 * e)  # hidden channels
+         self.cv1 = nn.Sequential(
+             Conv(c1, c1, 3, g=c1),
+             Conv(c1, 2 * c_, 1),
+             RepVGGDW(2 * c_) if lk else Conv(2 * c_, 2 * c_, 3, g=2 * c_),
+             Conv(2 * c_, c2, 1),
+             Conv(c2, c2, 3, g=c2),
+         )
+
+         self.add = shortcut and c1 == c2
+
+     def forward(self, x):
+         """
+         Forward pass of the CIB module.
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             (torch.Tensor): Output tensor.
+         """
+         return x + self.cv1(x) if self.add else self.cv1(x)
+
+
+ class C2fCIB(C2f):
+     """
+     C2fCIB class represents a convolutional block with C2f and CIB modules.
+
+     Args:
+         c1 (int): Number of input channels.
+         c2 (int): Number of output channels.
+         n (int, optional): Number of CIB modules to stack. Defaults to 1.
+         shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
+         lk (bool, optional): Whether to use local key connection. Defaults to False.
+         g (int, optional): Number of groups for grouped convolution. Defaults to 1.
+         e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
+     """
+
+     def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
+         """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion."""
+         super().__init__(c1, c2, n, shortcut, g, e)
+         self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
+
+
+ class Attention(nn.Module):
+     """
+     Attention module that performs self-attention on the input tensor.
+
+     Args:
+         dim (int): The input tensor dimension.
+         num_heads (int): The number of attention heads.
+         attn_ratio (float): The ratio of the attention key dimension to the head dimension.
+
+     Attributes:
+         num_heads (int): The number of attention heads.
+         head_dim (int): The dimension of each attention head.
+         key_dim (int): The dimension of the attention key.
+         scale (float): The scaling factor for the attention scores.
+         qkv (Conv): Convolutional layer for computing the query, key, and value.
+         proj (Conv): Convolutional layer for projecting the attended values.
+         pe (Conv): Convolutional layer for positional encoding.
+     """
+
+     def __init__(self, dim, num_heads=8, attn_ratio=0.5):
+         """Initializes multi-head attention module with query, key, and value convolutions and positional encoding."""
+         super().__init__()
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+         self.key_dim = int(self.head_dim * attn_ratio)
+         self.scale = self.key_dim**-0.5
+         nh_kd = nh_kd = self.key_dim * num_heads
+         h = dim + nh_kd * 2
+         self.qkv = Conv(dim, h, 1, act=False)
+         self.proj = Conv(dim, dim, 1, act=False)
+         self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
+
+     def forward(self, x):
+         """
+         Forward pass of the Attention module.
+
+         Args:
+             x (torch.Tensor): The input tensor.
+
+         Returns:
+             (torch.Tensor): The output tensor after self-attention.
+         """
+         B, C, H, W = x.shape
+         N = H * W
+         qkv = self.qkv(x)
+         q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
+             [self.key_dim, self.key_dim, self.head_dim], dim=2
+         )
+
+         attn = (q.transpose(-2, -1) @ k) * self.scale
+         attn = attn.softmax(dim=-1)
+         x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
+         x = self.proj(x)
+         return x
+
+
+ class PSA(nn.Module):
+     """
+     Position-wise Spatial Attention module.
+
+     Args:
+         c1 (int): Number of input channels.
+         c2 (int): Number of output channels.
+         e (float): Expansion factor for the intermediate channels. Default is 0.5.
+
+     Attributes:
+         c (int): Number of intermediate channels.
+         cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+         cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+         attn (Attention): Attention module for spatial attention.
+         ffn (nn.Sequential): Feed-forward network module.
+     """
+
+     def __init__(self, c1, c2, e=0.5):
+         """Initializes convolution layers, attention module, and feed-forward network with channel reduction."""
+         super().__init__()
+         assert c1 == c2
+         self.c = int(c1 * e)
+         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+         self.cv2 = Conv(2 * self.c, c1, 1)
+
+         self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
+         self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
+
+     def forward(self, x):
+         """
+         Forward pass of the PSA module.
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             (torch.Tensor): Output tensor.
+         """
+         a, b = self.cv1(x).split((self.c, self.c), dim=1)
+         b = b + self.attn(b)
+         b = b + self.ffn(b)
+         return self.cv2(torch.cat((a, b), 1))
+
+
+ class SCDown(nn.Module):
+     def __init__(self, c1, c2, k, s):
+         """
+         Spatial Channel Downsample (SCDown) module.
+
+         Args:
+             c1 (int): Number of input channels.
+             c2 (int): Number of output channels.
+             k (int): Kernel size for the convolutional layer.
+             s (int): Stride for the convolutional layer.
+         """
+         super().__init__()
+         self.cv1 = Conv(c1, c2, 1, 1)
+         self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
+
+     def forward(self, x):
+         """
+         Forward pass of the SCDown module.
+
+         Args:
+             x (torch.Tensor): Input tensor.
+
+         Returns:
+             (torch.Tensor): Output tensor after applying the SCDown module.
+         """
+         return self.cv2(self.cv1(x))
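Note on the new `RepVGGDW.fuse()`: because convolution is linear, the 3x3 depthwise branch can be zero-padded to 7x7 and folded into the 7x7 branch, so inference runs a single conv. A minimal standalone sketch of the same re-parameterization with plain `torch.nn.Conv2d` layers (no BatchNorm, so the `fuse_conv_and_bn` step is skipped; the names here are illustrative, not part of the package):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

ed = 8  # depthwise: groups == in_channels == out_channels
conv7 = nn.Conv2d(ed, ed, 7, 1, 3, groups=ed, bias=True)
conv3 = nn.Conv2d(ed, ed, 3, 1, 1, groups=ed, bias=True)

# Fold the 3x3 branch into the 7x7 branch: center-pad the 3x3 kernel to 7x7,
# then sum the kernels and biases into a single conv.
fused = nn.Conv2d(ed, ed, 7, 1, 3, groups=ed, bias=True)
with torch.no_grad():
    fused.weight.copy_(conv7.weight + F.pad(conv3.weight, [2, 2, 2, 2]))
    fused.bias.copy_(conv7.bias + conv3.bias)

x = torch.randn(1, ed, 16, 16)
act = nn.SiLU()
y_two_branch = act(conv7(x) + conv3(x))  # what forward() computes
y_one_branch = act(fused(x))             # what forward_fuse() computes
assert torch.allclose(y_two_branch, y_one_branch, atol=1e-5)
```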
ultralytics/nn/modules/head.py CHANGED
@@ -1,6 +1,7 @@
  # Ultralytics YOLO 🚀, AGPL-3.0 license
  """Model head modules."""

+ import copy
  import math

  import torch
@@ -14,7 +15,7 @@ from .conv import Conv
  from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
  from .utils import bias_init_with_prob, linear_init

- __all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"
+ __all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder", "v10Detect"


  class Detect(nn.Module):
@@ -22,6 +23,8 @@ class Detect(nn.Module):

      dynamic = False  # force grid reconstruction
      export = False  # export mode
+     end2end = False  # end2end
+     max_det = 300  # max_det
      shape = None
      anchors = torch.empty(0)  # init
      strides = torch.empty(0)  # init
@@ -41,13 +44,48 @@ class Detect(nn.Module):
          self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
          self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

+         if self.end2end:
+             self.one2one_cv2 = copy.deepcopy(self.cv2)
+             self.one2one_cv3 = copy.deepcopy(self.cv3)
+
      def forward(self, x):
          """Concatenates and returns predicted bounding boxes and class probabilities."""
+         if self.end2end:
+             return self.forward_end2end(x)
+
          for i in range(self.nl):
              x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
          if self.training:  # Training path
              return x
+         y = self._inference(x)
+         return y if self.export else (y, x)
+
+     def forward_end2end(self, x):
+         """
+         Performs forward pass of the v10Detect module.

+         Args:
+             x (tensor): Input tensor.
+
+         Returns:
+             (dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
+                 If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
+         """
+         x_detach = [xi.detach() for xi in x]
+         one2one = [
+             torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
+         ]
+         for i in range(self.nl):
+             x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
+         if self.training:  # Training path
+             return {"one2many": x, "one2one": one2one}
+
+         y = self._inference(one2one)
+         y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
+         return y if self.export else (y, {"one2many": x, "one2one": one2one})
+
+     def _inference(self, x):
+         """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
          # Inference path
          shape = x[0].shape  # BCHW
          x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
@@ -72,8 +110,7 @@ class Detect(nn.Module):
          else:
              dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

-         y = torch.cat((dbox, cls.sigmoid()), 1)
-         return y if self.export else (y, x)
+         return torch.cat((dbox, cls.sigmoid()), 1)

      def bias_init(self):
          """Initialize Detect() biases, WARNING: requires stride availability."""
@@ -83,10 +120,47 @@ class Detect(nn.Module):
          for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
              a[-1].bias.data[:] = 1.0  # box
              b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
+         if self.end2end:
+             for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride):  # from
+                 a[-1].bias.data[:] = 1.0  # box
+                 b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)

      def decode_bboxes(self, bboxes, anchors):
          """Decode bounding boxes."""
-         return dist2bbox(bboxes, anchors, xywh=True, dim=1)
+         return dist2bbox(bboxes, anchors, xywh=not self.end2end, dim=1)
+
+     @staticmethod
+     def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
+         """
+         Post-processes the predictions obtained from a YOLOv10 model.
+
+         Args:
+             preds (torch.Tensor): The predictions obtained from the model. It should have a shape of (batch_size, num_boxes, 4 + num_classes).
+             max_det (int): The maximum number of detections to keep.
+             nc (int, optional): The number of classes. Defaults to 80.
+
+         Returns:
+             (torch.Tensor): The post-processed predictions with shape (batch_size, max_det, 6),
+                 including bounding boxes, scores and cls.
+         """
+         assert 4 + nc == preds.shape[-1]
+         boxes, scores = preds.split([4, nc], dim=-1)
+         max_scores = scores.amax(dim=-1)
+         max_scores, index = torch.topk(max_scores, min(max_det, max_scores.shape[1]), axis=-1)
+         index = index.unsqueeze(-1)
+         boxes = torch.gather(boxes, dim=1, index=index.repeat(1, 1, boxes.shape[-1]))
+         scores = torch.gather(scores, dim=1, index=index.repeat(1, 1, scores.shape[-1]))
+
+         # NOTE: simplify but result slightly lower mAP
+         # scores, labels = scores.max(dim=-1)
+         # return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1)
+
+         scores, index = torch.topk(scores.flatten(1), max_det, axis=-1)
+         labels = index % nc
+         index = index // nc
+         boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+
+         return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1).to(boxes.dtype)], dim=-1)


  class Segment(Detect):
@@ -487,3 +561,39 @@ class RTDETRDecoder(nn.Module):
          xavier_uniform_(self.query_pos_head.layers[1].weight)
          for layer in self.input_proj:
              xavier_uniform_(layer[0].weight)
+
+
+ class v10Detect(Detect):
+     """
+     v10 Detection head from https://arxiv.org/pdf/2405.14458
+
+     Args:
+         nc (int): Number of classes.
+         ch (tuple): Tuple of channel sizes.
+
+     Attributes:
+         max_det (int): Maximum number of detections.
+
+     Methods:
+         __init__(self, nc=80, ch=()): Initializes the v10Detect object.
+         forward(self, x): Performs forward pass of the v10Detect module.
+         bias_init(self): Initializes biases of the Detect module.
+
+     """
+
+     end2end = True
+
+     def __init__(self, nc=80, ch=()):
+         """Initializes the v10Detect object with the specified number of classes and input channels."""
+         super().__init__(nc, ch)
+         c3 = max(ch[0], min(self.nc, 100))  # channels
+         # Light cls head
+         self.cv3 = nn.ModuleList(
+             nn.Sequential(
+                 nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),
+                 nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
+                 nn.Conv2d(c3, self.nc, 1),
+             )
+             for x in ch
+         )
+         self.one2one_cv3 = copy.deepcopy(self.cv3)
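The end-to-end path above skips NMS: `forward_end2end` decodes the one-to-one branch and hands it to `postprocess`, which simply keeps the `max_det` highest-scoring (box, class) pairs. A self-contained sketch of that selection on dummy tensors (written with `dim=` spelled explicitly; the authoritative version is the `postprocess` method shown in the diff):

```python
import torch

batch, num_anchors, nc, max_det = 2, 400, 80, 300
preds = torch.rand(batch, num_anchors, 4 + nc)  # decoded boxes + class scores

boxes, scores = preds.split([4, nc], dim=-1)

# 1) keep the max_det anchors with the highest per-anchor class score
top = torch.topk(scores.amax(dim=-1), min(max_det, num_anchors), dim=-1).indices.unsqueeze(-1)
boxes = boxes.gather(1, top.repeat(1, 1, 4))
scores = scores.gather(1, top.repeat(1, 1, nc))

# 2) take the global top max_det (anchor, class) pairs from the flattened scores
scores, idx = torch.topk(scores.flatten(1), max_det, dim=-1)
labels = idx % nc
boxes = boxes.gather(1, (idx // nc).unsqueeze(-1).repeat(1, 1, 4))

out = torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1).to(boxes.dtype)], dim=-1)
print(out.shape)  # torch.Size([2, 300, 6]) -> 4 box coords, score, class per detection
```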
ultralytics/nn/tasks.py CHANGED
@@ -15,6 +15,7 @@ from ultralytics.nn.modules import (
      C3TR,
      ELAN1,
      OBB,
+     PSA,
      SPP,
      SPPELAN,
      SPPF,
@@ -24,6 +25,7 @@ from ultralytics.nn.modules import (
      BottleneckCSP,
      C2f,
      C2fAttn,
+     C2fCIB,
      C3Ghost,
      C3x,
      CBFuse,
@@ -46,14 +48,24 @@ from ultralytics.nn.modules import (
      RepC3,
      RepConv,
      RepNCSPELAN4,
+     RepVGGDW,
      ResNetLayer,
      RTDETRDecoder,
+     SCDown,
      Segment,
      WorldDetect,
+     v10Detect,
  )
  from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
  from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
- from ultralytics.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8OBBLoss, v8PoseLoss, v8SegmentationLoss
+ from ultralytics.utils.loss import (
+     E2EDetectLoss,
+     v8ClassificationLoss,
+     v8DetectionLoss,
+     v8OBBLoss,
+     v8PoseLoss,
+     v8SegmentationLoss,
+ )
  from ultralytics.utils.plotting import feature_visualization
  from ultralytics.utils.torch_utils import (
      fuse_conv_and_bn,
@@ -192,6 +204,9 @@ class BaseModel(nn.Module):
                  if isinstance(m, RepConv):
                      m.fuse_convs()
                      m.forward = m.forward_fuse  # update forward
+                 if isinstance(m, RepVGGDW):
+                     m.fuse()
+                     m.forward = m.forward_fuse
              self.info(verbose=verbose)

          return self
@@ -294,6 +309,7 @@ class DetectionModel(BaseModel):
          self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
          self.names = {i: f"{i}" for i in range(self.yaml["nc"])}  # default names dict
          self.inplace = self.yaml.get("inplace", True)
+         self.end2end = getattr(self.model[-1], "end2end", False)

          # Build strides
          m = self.model[-1]  # Detect()
@@ -303,6 +319,8 @@ class DetectionModel(BaseModel):

              def _forward(x):
                  """Performs a forward pass through the model, handling different Detect subclass types accordingly."""
+                 if self.end2end:
+                     return self.forward(x)["one2many"]
                  return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)

              m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
@@ -355,7 +373,7 @@ class DetectionModel(BaseModel):

      def init_criterion(self):
          """Initialize the loss criterion for the DetectionModel."""
-         return v8DetectionLoss(self)
+         return E2EDetectLoss(self) if self.end2end else v8DetectionLoss(self)


  class OBBModel(DetectionModel):
@@ -675,7 +693,7 @@ class Ensemble(nn.ModuleList):


  @contextlib.contextmanager
- def temporary_modules(modules={}, attributes={}):
+ def temporary_modules(modules=None, attributes=None):
      """
      Context manager for temporarily adding or modifying modules in Python's module cache (`sys.modules`).

@@ -689,8 +707,8 @@ def temporary_modules(modules={}, attributes={}):

      Example:
          ```python
-         with temporary_modules({'old.module.path': 'new.module.path'}, {'old.module.attribute': 'new.module.attribute'}):
-             import old.module.path  # this will now import new.module.path
+         with temporary_modules({'old.module': 'new.module'}, {'old.module.attribute': 'new.module.attribute'}):
+             import old.module  # this will now import new.module
              from old.module import attribute  # this will now import new.module.attribute
          ```

@@ -700,23 +718,23 @@ def temporary_modules(modules={}, attributes={}):
      applications or libraries. Use this function with caution.
      """

-     import importlib
+     if modules is None:
+         modules = {}
+     if attributes is None:
+         attributes = {}
      import sys
+     from importlib import import_module

      try:
          # Set attributes in sys.modules under their old name
          for old, new in attributes.items():
              old_module, old_attr = old.rsplit(".", 1)
              new_module, new_attr = new.rsplit(".", 1)
-             setattr(
-                 importlib.import_module(old_module),
-                 old_attr,
-                 getattr(importlib.import_module(new_module), new_attr),
-             )
+             setattr(import_module(old_module), old_attr, getattr(import_module(new_module), new_attr))

          # Set modules in sys.modules under their old name
          for old, new in modules.items():
-             sys.modules[old] = importlib.import_module(new)
+             sys.modules[old] = import_module(new)

          yield
      finally:
@@ -750,9 +768,10 @@ def torch_safe_load(weight):
                  "ultralytics.yolo.data": "ultralytics.data",
              },
              attributes={
-                 "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",
+                 "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",  # YOLOv9e
+                 "ultralytics.nn.tasks.YOLOv10DetectionModel": "ultralytics.nn.tasks.DetectionModel",  # YOLOv10
              },
-         ):  # for legacy 8.0 Classify and Pose models
+         ):
              ckpt = torch.load(file, map_location="cpu")

      except ModuleNotFoundError as e:  # e.name is missing module name
@@ -911,6 +930,9 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
              DWConvTranspose2d,
              C3x,
              RepC3,
+             PSA,
+             SCDown,
+             C2fCIB,
          }:
              c1, c2 = ch[f], args[0]
              if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
@@ -922,7 +944,7 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
                  )  # num heads

              args = [c1, c2, *args[1:]]
-             if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3}:
+             if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3, C2fCIB}:
                  args.insert(2, n)  # number of repeats
                  n = 1
          elif m is AIFI:
@@ -939,7 +961,7 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
              args = [ch[f]]
          elif m is Concat:
              c2 = sum(ch[x] for x in f)
-         elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn}:
+         elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn, v10Detect}:
              args.append([ch[x] for x in f])
              if m is Segment:
                  args[2] = make_divisible(min(args[2], max_channels) * width, 8)
@@ -1024,7 +1046,7 @@ def guess_model_task(model):
          m = cfg["head"][-1][-2].lower()  # output module name
          if m in {"classify", "classifier", "cls", "fc"}:
              return "classify"
-         if m == "detect":
+         if "detect" in m:
              return "detect"
          if m == "segment":
              return "segment"
@@ -1056,7 +1078,7 @@ def guess_model_task(model):
                  return "pose"
              elif isinstance(m, OBB):
                  return "obb"
-             elif isinstance(m, (Detect, WorldDetect)):
+             elif isinstance(m, (Detect, WorldDetect, v10Detect)):
                  return "detect"

      # Guess from model filename
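The `temporary_modules(modules=None, attributes=None)` signature change above removes a classic Python pitfall: a mutable default (`{}`) is created once at function definition time and shared by every call. A tiny illustration of the difference, using hypothetical helper names rather than ultralytics code:

```python
def register_bad(name, registry={}):  # a single dict shared across all calls
    registry[name] = True
    return registry

def register_good(name, registry=None):  # a fresh dict per call, same pattern as temporary_modules
    if registry is None:
        registry = {}
    registry[name] = True
    return registry

print(register_bad("a"))   # {'a': True}
print(register_bad("b"))   # {'a': True, 'b': True}  <- state leaked from the first call
print(register_good("a"))  # {'a': True}
print(register_good("b"))  # {'b': True}
```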
ultralytics/solutions/__init__.py CHANGED
@@ -15,6 +15,7 @@ __all__ = (
      "Heatmap",
      "ObjectCounter",
      "ParkingManagement",
+     "ParkingPtsSelection",
      "QueueManager",
      "SpeedEstimator",
      "Analytics",
ultralytics/utils/__init__.py CHANGED
@@ -1070,7 +1070,7 @@ TESTS_RUNNING = is_pytest_running() or is_github_action_running()
  set_sentry()

  # Apply monkey patches
- from .patches import imread, imshow, imwrite, torch_save
+ from ultralytics.utils.patches import imread, imshow, imwrite, torch_save

  torch.save = torch_save
  if WINDOWS:
ultralytics/utils/benchmarks.py CHANGED
@@ -81,6 +81,7 @@ def benchmark(
      device = select_device(device, verbose=False)
      if isinstance(model, (str, Path)):
          model = YOLO(model)
+     is_end2end = getattr(model.model.model[-1], "end2end", False)

      y = []
      t0 = time.time()
@@ -96,14 +97,18 @@ def benchmark(
                  assert MACOS or LINUX, "CoreML and TF.js export only supported on macOS and Linux"
                  assert not IS_RASPBERRYPI, "CoreML and TF.js export not supported on Raspberry Pi"
                  assert not IS_JETSON, "CoreML and TF.js export not supported on NVIDIA Jetson"
+                 assert not is_end2end, "End-to-end models not supported by CoreML and TF.js yet"
              if i in {3, 5}:  # CoreML and OpenVINO
                  assert not IS_PYTHON_3_12, "CoreML and OpenVINO not supported on Python 3.12"
              if i in {6, 7, 8, 9, 10}:  # All TF formats
                  assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
+                 assert not is_end2end, "End-to-end models not supported by onnx2tf yet"
              if i in {11}:  # Paddle
                  assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
+                 assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
              if i in {12}:  # NCNN
                  assert not isinstance(model, YOLOWorld), "YOLOWorldv2 NCNN exports not supported yet"
+                 assert not is_end2end, "End-to-end models not supported by NCNN yet"
              if "cpu" in device.type:
                  assert cpu, "inference not supported on CPU"
              if "cuda" in device.type:
ultralytics/utils/downloads.py CHANGED
@@ -23,6 +23,7 @@ GITHUB_ASSETS_NAMES = (
      + [f"yolov8{k}-world.pt" for k in "smlx"]
      + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
      + [f"yolov9{k}.pt" for k in "ce"]
+     + [f"yolov10{k}.pt" for k in "nsmblx"]
      + [f"yolo_nas_{k}.pt" for k in "sml"]
      + [f"sam_{k}.pt" for k in "bl"]
      + [f"FastSAM-{k}.pt" for k in "sx"]