ultralytics 8.2.37__py3-none-any.whl → 8.2.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ultralytics has been flagged as possibly problematic; details are available on the package's registry page.

tests/test_python.py CHANGED
@@ -577,3 +577,12 @@ def test_yolo_world():
         close_mosaic=1,
         trainer=WorldTrainerFromScratch,
     )
+
+
+def test_yolov10():
+    """A simple test for yolov10 for now."""
+    model = YOLO("yolov10n.yaml")
+    # train/val/predict
+    model.train(data="coco8.yaml", epochs=1, imgsz=32, close_mosaic=1, cache="disk")
+    model.val(data="coco8.yaml", imgsz=32)
+    model(SOURCE)
ultralytics/__init__.py CHANGED
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.2.37"
+__version__ = "8.2.38"
 
 import os
 
ultralytics/cfg/models/v10/yolov10b.yaml ADDED
@@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# YOLOv10 object detection model. For Usage examples see https://docs.ultralytics.com/tasks/detect
+
+# Parameters
+nc: 80 # number of classes
+scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
+  # [depth, width, max_channels]
+  b: [0.67, 1.00, 512]
+
+backbone:
+  # [from, repeats, module, args]
+  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
+  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
+  - [-1, 3, C2f, [128, True]]
+  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
+  - [-1, 6, C2f, [256, True]]
+  - [-1, 1, SCDown, [512, 3, 2]] # 5-P4/16
+  - [-1, 6, C2f, [512, True]]
+  - [-1, 1, SCDown, [1024, 3, 2]] # 7-P5/32
+  - [-1, 3, C2fCIB, [1024, True]]
+  - [-1, 1, SPPF, [1024, 5]] # 9
+  - [-1, 1, PSA, [1024]] # 10
+
+# YOLOv8.0n head
+head:
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
+  - [-1, 3, C2fCIB, [512, True]] # 13
+
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
+  - [-1, 3, C2f, [256]] # 16 (P3/8-small)
+
+  - [-1, 1, Conv, [256, 3, 2]]
+  - [[-1, 13], 1, Concat, [1]] # cat head P4
+  - [-1, 3, C2fCIB, [512, True]] # 19 (P4/16-medium)
+
+  - [-1, 1, SCDown, [512, 3, 2]]
+  - [[-1, 10], 1, Concat, [1]] # cat head P5
+  - [-1, 3, C2fCIB, [1024, True]] # 22 (P5/32-large)
+
+  - [[16, 19, 22], 1, v10Detect, [nc]] # Detect(P3, P4, P5)
ultralytics/cfg/models/v10/yolov10l.yaml ADDED
@@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# YOLOv10 object detection model. For Usage examples see https://docs.ultralytics.com/tasks/detect
+
+# Parameters
+nc: 80 # number of classes
+scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
+  # [depth, width, max_channels]
+  l: [1.00, 1.00, 512]
+
+backbone:
+  # [from, repeats, module, args]
+  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
+  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
+  - [-1, 3, C2f, [128, True]]
+  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
+  - [-1, 6, C2f, [256, True]]
+  - [-1, 1, SCDown, [512, 3, 2]] # 5-P4/16
+  - [-1, 6, C2f, [512, True]]
+  - [-1, 1, SCDown, [1024, 3, 2]] # 7-P5/32
+  - [-1, 3, C2fCIB, [1024, True]]
+  - [-1, 1, SPPF, [1024, 5]] # 9
+  - [-1, 1, PSA, [1024]] # 10
+
+# YOLOv8.0n head
+head:
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
+  - [-1, 3, C2fCIB, [512, True]] # 13
+
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
+  - [-1, 3, C2f, [256]] # 16 (P3/8-small)
+
+  - [-1, 1, Conv, [256, 3, 2]]
+  - [[-1, 13], 1, Concat, [1]] # cat head P4
+  - [-1, 3, C2fCIB, [512, True]] # 19 (P4/16-medium)
+
+  - [-1, 1, SCDown, [512, 3, 2]]
+  - [[-1, 10], 1, Concat, [1]] # cat head P5
+  - [-1, 3, C2fCIB, [1024, True]] # 22 (P5/32-large)
+
+  - [[16, 19, 22], 1, v10Detect, [nc]] # Detect(P3, P4, P5)
ultralytics/cfg/models/v10/yolov10m.yaml ADDED
@@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# YOLOv10 object detection model. For Usage examples see https://docs.ultralytics.com/tasks/detect
+
+# Parameters
+nc: 80 # number of classes
+scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
+  # [depth, width, max_channels]
+  m: [0.67, 0.75, 768]
+
+backbone:
+  # [from, repeats, module, args]
+  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
+  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
+  - [-1, 3, C2f, [128, True]]
+  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
+  - [-1, 6, C2f, [256, True]]
+  - [-1, 1, SCDown, [512, 3, 2]] # 5-P4/16
+  - [-1, 6, C2f, [512, True]]
+  - [-1, 1, SCDown, [1024, 3, 2]] # 7-P5/32
+  - [-1, 3, C2fCIB, [1024, True]]
+  - [-1, 1, SPPF, [1024, 5]] # 9
+  - [-1, 1, PSA, [1024]] # 10
+
+# YOLOv8.0n head
+head:
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
+  - [-1, 3, C2f, [512]] # 13
+
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
+  - [-1, 3, C2f, [256]] # 16 (P3/8-small)
+
+  - [-1, 1, Conv, [256, 3, 2]]
+  - [[-1, 13], 1, Concat, [1]] # cat head P4
+  - [-1, 3, C2fCIB, [512, True]] # 19 (P4/16-medium)
+
+  - [-1, 1, SCDown, [512, 3, 2]]
+  - [[-1, 10], 1, Concat, [1]] # cat head P5
+  - [-1, 3, C2fCIB, [1024, True]] # 22 (P5/32-large)
+
+  - [[16, 19, 22], 1, v10Detect, [nc]] # Detect(P3, P4, P5)
ultralytics/cfg/models/v10/yolov10n.yaml ADDED
@@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# YOLOv10 object detection model. For Usage examples see https://docs.ultralytics.com/tasks/detect
+
+# Parameters
+nc: 80 # number of classes
+scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
+  # [depth, width, max_channels]
+  n: [0.33, 0.25, 1024]
+
+backbone:
+  # [from, repeats, module, args]
+  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
+  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
+  - [-1, 3, C2f, [128, True]]
+  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
+  - [-1, 6, C2f, [256, True]]
+  - [-1, 1, SCDown, [512, 3, 2]] # 5-P4/16
+  - [-1, 6, C2f, [512, True]]
+  - [-1, 1, SCDown, [1024, 3, 2]] # 7-P5/32
+  - [-1, 3, C2f, [1024, True]]
+  - [-1, 1, SPPF, [1024, 5]] # 9
+  - [-1, 1, PSA, [1024]] # 10
+
+# YOLOv8.0n head
+head:
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
+  - [-1, 3, C2f, [512]] # 13
+
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
+  - [-1, 3, C2f, [256]] # 16 (P3/8-small)
+
+  - [-1, 1, Conv, [256, 3, 2]]
+  - [[-1, 13], 1, Concat, [1]] # cat head P4
+  - [-1, 3, C2f, [512]] # 19 (P4/16-medium)
+
+  - [-1, 1, SCDown, [512, 3, 2]]
+  - [[-1, 10], 1, Concat, [1]] # cat head P5
+  - [-1, 3, C2fCIB, [1024, True, True]] # 22 (P5/32-large)
+
+  - [[16, 19, 22], 1, v10Detect, [nc]] # Detect(P3, P4, P5)
ultralytics/cfg/models/v10/yolov10s.yaml ADDED
@@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# YOLOv10 object detection model. For Usage examples see https://docs.ultralytics.com/tasks/detect
+
+# Parameters
+nc: 80 # number of classes
+scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
+  # [depth, width, max_channels]
+  s: [0.33, 0.50, 1024]
+
+backbone:
+  # [from, repeats, module, args]
+  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
+  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
+  - [-1, 3, C2f, [128, True]]
+  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
+  - [-1, 6, C2f, [256, True]]
+  - [-1, 1, SCDown, [512, 3, 2]] # 5-P4/16
+  - [-1, 6, C2f, [512, True]]
+  - [-1, 1, SCDown, [1024, 3, 2]] # 7-P5/32
+  - [-1, 3, C2fCIB, [1024, True, True]]
+  - [-1, 1, SPPF, [1024, 5]] # 9
+  - [-1, 1, PSA, [1024]] # 10
+
+# YOLOv8.0n head
+head:
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
+  - [-1, 3, C2f, [512]] # 13
+
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
+  - [-1, 3, C2f, [256]] # 16 (P3/8-small)
+
+  - [-1, 1, Conv, [256, 3, 2]]
+  - [[-1, 13], 1, Concat, [1]] # cat head P4
+  - [-1, 3, C2f, [512]] # 19 (P4/16-medium)
+
+  - [-1, 1, SCDown, [512, 3, 2]]
+  - [[-1, 10], 1, Concat, [1]] # cat head P5
+  - [-1, 3, C2fCIB, [1024, True, True]] # 22 (P5/32-large)
+
+  - [[16, 19, 22], 1, v10Detect, [nc]] # Detect(P3, P4, P5)
ultralytics/cfg/models/v10/yolov10x.yaml ADDED
@@ -0,0 +1,42 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# YOLOv10 object detection model. For Usage examples see https://docs.ultralytics.com/tasks/detect
+
+# Parameters
+nc: 80 # number of classes
+scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
+  # [depth, width, max_channels]
+  x: [1.00, 1.25, 512]
+
+backbone:
+  # [from, repeats, module, args]
+  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
+  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
+  - [-1, 3, C2f, [128, True]]
+  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
+  - [-1, 6, C2f, [256, True]]
+  - [-1, 1, SCDown, [512, 3, 2]] # 5-P4/16
+  - [-1, 6, C2fCIB, [512, True]]
+  - [-1, 1, SCDown, [1024, 3, 2]] # 7-P5/32
+  - [-1, 3, C2fCIB, [1024, True]]
+  - [-1, 1, SPPF, [1024, 5]] # 9
+  - [-1, 1, PSA, [1024]] # 10
+
+# YOLOv8.0n head
+head:
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
+  - [-1, 3, C2fCIB, [512, True]] # 13
+
+  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
+  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
+  - [-1, 3, C2f, [256]] # 16 (P3/8-small)
+
+  - [-1, 1, Conv, [256, 3, 2]]
+  - [[-1, 13], 1, Concat, [1]] # cat head P4
+  - [-1, 3, C2fCIB, [512, True]] # 19 (P4/16-medium)
+
+  - [-1, 1, SCDown, [512, 3, 2]]
+  - [[-1, 10], 1, Concat, [1]] # cat head P5
+  - [-1, 3, C2fCIB, [1024, True]] # 22 (P5/32-large)
+
+  - [[16, 19, 22], 1, v10Detect, [nc]] # Detect(P3, P4, P5)
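The six YAML files above add the YOLOv10 n/s/m/b/l/x scales, built from the new SCDown, PSA, C2fCIB and v10Detect modules. As a rough usage sketch (mirroring the new test_yolov10 test rather than any official guide; the image path below is a placeholder), any of these configs can be exercised through the existing YOLO API:

    from ultralytics import YOLO

    model = YOLO("yolov10n.yaml")  # build the nano-scale architecture from the new config (no pretrained weights)
    model.train(data="coco8.yaml", epochs=1, imgsz=32)  # quick smoke-test run on the tiny coco8 dataset
    model.val(data="coco8.yaml", imgsz=32)
    model("path/to/image.jpg")  # placeholder image path; prediction returns Results objects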
ultralytics/engine/exporter.py CHANGED
@@ -920,6 +920,7 @@ class Exporter:
     @try_export
     def export_tflite(self, keras_model, nms, agnostic_nms, prefix=colorstr("TensorFlow Lite:")):
         """YOLOv8 TensorFlow Lite export."""
+        # BUG https://github.com/ultralytics/ultralytics/issues/13436
         import tensorflow as tf  # noqa
 
         LOGGER.info(f"\n{prefix} starting export with tensorflow {tf.__version__}...")
ultralytics/nn/modules/__init__.py CHANGED
@@ -22,18 +22,22 @@ from .block import (
     C2,
     C3,
     C3TR,
+    CIB,
     DFL,
     ELAN1,
+    PSA,
     SPP,
     SPPELAN,
     SPPF,
     AConv,
     ADown,
+    Attention,
     BNContrastiveHead,
     Bottleneck,
     BottleneckCSP,
     C2f,
     C2fAttn,
+    C2fCIB,
     C3Ghost,
     C3x,
     CBFuse,
@@ -46,7 +50,9 @@ from .block import (
     Proto,
     RepC3,
     RepNCSPELAN4,
+    RepVGGDW,
     ResNetLayer,
+    SCDown,
 )
 from .conv import (
     CBAM,
@@ -63,7 +69,7 @@ from .conv import (
     RepConv,
     SpatialAttention,
 )
-from .head import OBB, Classify, Detect, Pose, RTDETRDecoder, Segment, WorldDetect
+from .head import OBB, Classify, Detect, Pose, RTDETRDecoder, Segment, WorldDetect, v10Detect
 from .transformer import (
     AIFI,
     MLP,
@@ -137,4 +143,10 @@ __all__ = (
     "CBLinear",
     "AConv",
     "ELAN1",
+    "RepVGGDW",
+    "CIB",
+    "C2fCIB",
+    "Attention",
+    "PSA",
+    "SCDown",
 )
ultralytics/nn/modules/block.py CHANGED
@@ -5,6 +5,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ultralytics.utils.torch_utils import fuse_conv_and_bn
+
 from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
 from .transformer import TransformerBlock
 
@@ -39,6 +41,12 @@ __all__ = (
     "CBFuse",
     "CBLinear",
     "Silence",
+    "RepVGGDW",
+    "CIB",
+    "C2fCIB",
+    "Attention",
+    "PSA",
+    "SCDown",
 )
 
 
@@ -699,3 +707,251 @@ class CBFuse(nn.Module):
         target_size = xs[-1].shape[2:]
         res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
         return torch.sum(torch.stack(res + xs[-1:]), dim=0)
+
+
+class RepVGGDW(torch.nn.Module):
+    """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
+
+    def __init__(self, ed) -> None:
+        super().__init__()
+        self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
+        self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
+        self.dim = ed
+        self.act = nn.SiLU()
+
+    def forward(self, x):
+        """
+        Performs a forward pass of the RepVGGDW block.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
+        """
+        return self.act(self.conv(x) + self.conv1(x))
+
+    def forward_fuse(self, x):
+        """
+        Performs a forward pass of the RepVGGDW block without fusing the convolutions.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
+        """
+        return self.act(self.conv(x))
+
+    @torch.no_grad()
+    def fuse(self):
+        """
+        Fuses the convolutional layers in the RepVGGDW block.
+
+        This method fuses the convolutional layers and updates the weights and biases accordingly.
+        """
+        conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
+        conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)
+
+        conv_w = conv.weight
+        conv_b = conv.bias
+        conv1_w = conv1.weight
+        conv1_b = conv1.bias
+
+        conv1_w = torch.nn.functional.pad(conv1_w, [2, 2, 2, 2])
+
+        final_conv_w = conv_w + conv1_w
+        final_conv_b = conv_b + conv1_b
+
+        conv.weight.data.copy_(final_conv_w)
+        conv.bias.data.copy_(final_conv_b)
+
+        self.conv = conv
+        del self.conv1
+
+
+class CIB(nn.Module):
+    """
+    Conditional Identity Block (CIB) module.
+
+    Args:
+        c1 (int): Number of input channels.
+        c2 (int): Number of output channels.
+        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
+        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
+        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
+    """
+
+    def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
+        """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer."""
+        super().__init__()
+        c_ = int(c2 * e)  # hidden channels
+        self.cv1 = nn.Sequential(
+            Conv(c1, c1, 3, g=c1),
+            Conv(c1, 2 * c_, 1),
+            Conv(2 * c_, 2 * c_, 3, g=2 * c_) if not lk else RepVGGDW(2 * c_),
+            Conv(2 * c_, c2, 1),
+            Conv(c2, c2, 3, g=c2),
+        )
+
+        self.add = shortcut and c1 == c2
+
+    def forward(self, x):
+        """
+        Forward pass of the CIB module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor.
+        """
+        return x + self.cv1(x) if self.add else self.cv1(x)
+
+
+class C2fCIB(C2f):
+    """
+    C2fCIB class represents a convolutional block with C2f and CIB modules.
+
+    Args:
+        c1 (int): Number of input channels.
+        c2 (int): Number of output channels.
+        n (int, optional): Number of CIB modules to stack. Defaults to 1.
+        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
+        lk (bool, optional): Whether to use local key connection. Defaults to False.
+        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
+        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
+    """
+
+    def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
+        """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion."""
+        super().__init__(c1, c2, n, shortcut, g, e)
+        self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
+
+
+class Attention(nn.Module):
+    """
+    Attention module that performs self-attention on the input tensor.
+
+    Args:
+        dim (int): The input tensor dimension.
+        num_heads (int): The number of attention heads.
+        attn_ratio (float): The ratio of the attention key dimension to the head dimension.
+
+    Attributes:
+        num_heads (int): The number of attention heads.
+        head_dim (int): The dimension of each attention head.
+        key_dim (int): The dimension of the attention key.
+        scale (float): The scaling factor for the attention scores.
+        qkv (Conv): Convolutional layer for computing the query, key, and value.
+        proj (Conv): Convolutional layer for projecting the attended values.
+        pe (Conv): Convolutional layer for positional encoding.
+    """
+
+    def __init__(self, dim, num_heads=8, attn_ratio=0.5):
+        """Initializes multi-head attention module with query, key, and value convolutions and positional encoding."""
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.key_dim = int(self.head_dim * attn_ratio)
+        self.scale = self.key_dim**-0.5
+        nh_kd = nh_kd = self.key_dim * num_heads
+        h = dim + nh_kd * 2
+        self.qkv = Conv(dim, h, 1, act=False)
+        self.proj = Conv(dim, dim, 1, act=False)
+        self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
+
+    def forward(self, x):
+        """
+        Forward pass of the Attention module.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            (torch.Tensor): The output tensor after self-attention.
+        """
+        B, C, H, W = x.shape
+        N = H * W
+        qkv = self.qkv(x)
+        q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
+            [self.key_dim, self.key_dim, self.head_dim], dim=2
+        )
+
+        attn = (q.transpose(-2, -1) @ k) * self.scale
+        attn = attn.softmax(dim=-1)
+        x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
+        x = self.proj(x)
+        return x
+
+
+class PSA(nn.Module):
+    """
+    Position-wise Spatial Attention module.
+
+    Args:
+        c1 (int): Number of input channels.
+        c2 (int): Number of output channels.
+        e (float): Expansion factor for the intermediate channels. Default is 0.5.
+
+    Attributes:
+        c (int): Number of intermediate channels.
+        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
+        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
+        attn (Attention): Attention module for spatial attention.
+        ffn (nn.Sequential): Feed-forward network module.
+    """
+
+    def __init__(self, c1, c2, e=0.5):
+        """Initializes convolution layers, attention module, and feed-forward network with channel reduction."""
+        super().__init__()
+        assert c1 == c2
+        self.c = int(c1 * e)
+        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
+        self.cv2 = Conv(2 * self.c, c1, 1)
+
+        self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
+        self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
+
+    def forward(self, x):
+        """
+        Forward pass of the PSA module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor.
+        """
+        a, b = self.cv1(x).split((self.c, self.c), dim=1)
+        b = b + self.attn(b)
+        b = b + self.ffn(b)
+        return self.cv2(torch.cat((a, b), 1))
+
+
+class SCDown(nn.Module):
+    def __init__(self, c1, c2, k, s):
+        """
+        Spatial Channel Downsample (SCDown) module.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (int): Kernel size for the convolutional layer.
+            s (int): Stride for the convolutional layer.
+        """
+        super().__init__()
+        self.cv1 = Conv(c1, c2, 1, 1)
+        self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
+
+    def forward(self, x):
+        """
+        Forward pass of the SCDown module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying the SCDown module.
+        """
+        return self.cv2(self.cv1(x))
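The RepVGGDW block added above trains with parallel 7x7 and 3x3 depth-wise branches and folds them into a single convolution at fusion time (the 3x3 kernel is zero-padded to 7x7 and added). A minimal sketch of that equivalence, assuming a freshly constructed block in eval mode:

    import torch
    from ultralytics.nn.modules.block import RepVGGDW

    m = RepVGGDW(64).eval()              # 64-channel depth-wise block
    x = torch.randn(1, 64, 32, 32)
    y_two_branch = m(x)                  # 7x7 branch + 3x3 branch
    m.fuse()                             # fold BN and merge the padded 3x3 kernel into the 7x7 conv
    y_fused = m.forward_fuse(x)          # single fused convolution
    print(torch.allclose(y_two_branch, y_fused, atol=1e-4))  # expected: True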
ultralytics/nn/modules/head.py CHANGED
@@ -1,6 +1,7 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 """Model head modules."""
 
+import copy
 import math
 
 import torch
@@ -14,7 +15,7 @@ from .conv import Conv
 from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
 from .utils import bias_init_with_prob, linear_init
 
-__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"
+__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder", "v10Detect"
 
 
 class Detect(nn.Module):
@@ -22,6 +23,8 @@ class Detect(nn.Module):
 
     dynamic = False  # force grid reconstruction
    export = False  # export mode
+    end2end = False  # end2end
+    max_det = 300  # max_det
     shape = None
     anchors = torch.empty(0)  # init
     strides = torch.empty(0)  # init
@@ -41,13 +44,48 @@ class Detect(nn.Module):
         self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
         self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
 
+        if self.end2end:
+            self.one2one_cv2 = copy.deepcopy(self.cv2)
+            self.one2one_cv3 = copy.deepcopy(self.cv3)
+
     def forward(self, x):
         """Concatenates and returns predicted bounding boxes and class probabilities."""
+        if self.end2end:
+            return self.forward_end2end(x)
+
         for i in range(self.nl):
             x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
         if self.training:  # Training path
             return x
+        y = self._inference(x)
+        return y if self.export else (y, x)
+
+    def forward_end2end(self, x):
+        """
+        Performs forward pass of the v10Detect module.
+
+        Args:
+            x (tensor): Input tensor.
+
+        Returns:
+            (dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
+                If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
+        """
+        x_detach = [xi.detach() for xi in x]
+        one2one = [
+            torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
+        ]
+        for i in range(self.nl):
+            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
+        if self.training:  # Training path
+            return {"one2many": x, "one2one": one2one}
 
+        y = self._inference(one2one)
+        y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
+        return y if self.export else (y, {"one2many": x, "one2one": one2one})
+
+    def _inference(self, x):
+        """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
         # Inference path
         shape = x[0].shape  # BCHW
         x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
@@ -73,7 +111,7 @@ class Detect(nn.Module):
         dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
 
         y = torch.cat((dbox, cls.sigmoid()), 1)
-        return y if self.export else (y, x)
+        return y
 
     def bias_init(self):
         """Initialize Detect() biases, WARNING: requires stride availability."""
@@ -83,10 +121,47 @@ class Detect(nn.Module):
         for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
             a[-1].bias.data[:] = 1.0  # box
             b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
+        if self.end2end:
+            for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride):  # from
+                a[-1].bias.data[:] = 1.0  # box
+                b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
 
     def decode_bboxes(self, bboxes, anchors):
         """Decode bounding boxes."""
-        return dist2bbox(bboxes, anchors, xywh=True, dim=1)
+        return dist2bbox(bboxes, anchors, xywh=not self.end2end, dim=1)
+
+    @staticmethod
+    def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
+        """
+        Post-processes the predictions obtained from a YOLOv10 model.
+
+        Args:
+            preds (torch.Tensor): The predictions obtained from the model. It should have a shape of (batch_size, num_boxes, 4 + num_classes).
+            max_det (int): The maximum number of detections to keep.
+            nc (int, optional): The number of classes. Defaults to 80.
+
+        Returns:
+            (torch.Tensor): The post-processed predictions with shape (batch_size, max_det, 6),
+                including bounding boxes, scores and cls.
+        """
+        assert 4 + nc == preds.shape[-1]
+        boxes, scores = preds.split([4, nc], dim=-1)
+        max_scores = scores.amax(dim=-1)
+        max_scores, index = torch.topk(max_scores, min(max_det, max_scores.shape[1]), axis=-1)
+        index = index.unsqueeze(-1)
+        boxes = torch.gather(boxes, dim=1, index=index.repeat(1, 1, boxes.shape[-1]))
+        scores = torch.gather(scores, dim=1, index=index.repeat(1, 1, scores.shape[-1]))
+
+        # NOTE: simplify but result slightly lower mAP
+        # scores, labels = scores.max(dim=-1)
+        # return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1)
+
+        scores, index = torch.topk(scores.flatten(1), max_det, axis=-1)
+        labels = index % nc
+        index = index // nc
+        boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+
+        return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1).to(boxes.dtype)], dim=-1)
 
 
 class Segment(Detect):
@@ -487,3 +562,39 @@ class RTDETRDecoder(nn.Module):
         xavier_uniform_(self.query_pos_head.layers[1].weight)
         for layer in self.input_proj:
             xavier_uniform_(layer[0].weight)
+
+
+class v10Detect(Detect):
+    """
+    v10 Detection head from https://arxiv.org/pdf/2405.14458
+
+    Args:
+        nc (int): Number of classes.
+        ch (tuple): Tuple of channel sizes.
+
+    Attributes:
+        max_det (int): Maximum number of detections.
+
+    Methods:
+        __init__(self, nc=80, ch=()): Initializes the v10Detect object.
+        forward(self, x): Performs forward pass of the v10Detect module.
+        bias_init(self): Initializes biases of the Detect module.
+
+    """
+
+    end2end = True
+
+    def __init__(self, nc=80, ch=()):
+        """Initializes the v10Detect object with the specified number of classes and input channels."""
+        super().__init__(nc, ch)
+        c3 = max(ch[0], min(self.nc, 100))  # channels
+        # Light cls head
+        self.cv3 = nn.ModuleList(
+            nn.Sequential(
+                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),
+                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
+                nn.Conv2d(c3, self.nc, 1),
+            )
+            for x in ch
+        )
+        self.one2one_cv3 = copy.deepcopy(self.cv3)
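With end2end heads, Detect.postprocess replaces NMS: it keeps the top max_det candidates by class score and returns a (batch, max_det, 6) tensor of box, score and class index. A small shape-only sketch, with random numbers standing in for real predictions:

    import torch
    from ultralytics.nn.modules.head import Detect

    preds = torch.rand(2, 8400, 4 + 80)           # (batch, candidates, 4 box coords + 80 class scores)
    out = Detect.postprocess(preds, max_det=300, nc=80)
    print(out.shape)                              # torch.Size([2, 300, 6]) -> box(4), score, class index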
ultralytics/nn/tasks.py CHANGED
@@ -15,6 +15,7 @@ from ultralytics.nn.modules import (
     C3TR,
     ELAN1,
     OBB,
+    PSA,
     SPP,
     SPPELAN,
     SPPF,
@@ -24,6 +25,7 @@ from ultralytics.nn.modules import (
     BottleneckCSP,
     C2f,
     C2fAttn,
+    C2fCIB,
     C3Ghost,
     C3x,
     CBFuse,
@@ -46,14 +48,24 @@ from ultralytics.nn.modules import (
     RepC3,
     RepConv,
     RepNCSPELAN4,
+    RepVGGDW,
     ResNetLayer,
     RTDETRDecoder,
+    SCDown,
     Segment,
     WorldDetect,
+    v10Detect,
 )
 from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
 from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
-from ultralytics.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8OBBLoss, v8PoseLoss, v8SegmentationLoss
+from ultralytics.utils.loss import (
+    E2EDetectLoss,
+    v8ClassificationLoss,
+    v8DetectionLoss,
+    v8OBBLoss,
+    v8PoseLoss,
+    v8SegmentationLoss,
+)
 from ultralytics.utils.plotting import feature_visualization
 from ultralytics.utils.torch_utils import (
     fuse_conv_and_bn,
@@ -192,6 +204,9 @@ class BaseModel(nn.Module):
                 if isinstance(m, RepConv):
                     m.fuse_convs()
                     m.forward = m.forward_fuse  # update forward
+                if isinstance(m, RepVGGDW):
+                    m.fuse()
+                    m.forward = m.forward_fuse
             self.info(verbose=verbose)
 
         return self
@@ -294,6 +309,7 @@ class DetectionModel(BaseModel):
         self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
         self.names = {i: f"{i}" for i in range(self.yaml["nc"])}  # default names dict
         self.inplace = self.yaml.get("inplace", True)
+        self.end2end = getattr(self.model[-1], "end2end", False)
 
         # Build strides
         m = self.model[-1]  # Detect()
@@ -303,6 +319,8 @@ class DetectionModel(BaseModel):
 
             def _forward(x):
                 """Performs a forward pass through the model, handling different Detect subclass types accordingly."""
+                if self.end2end:
+                    return self.forward(x)["one2many"]
                 return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
 
             m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
@@ -355,7 +373,7 @@
 
     def init_criterion(self):
         """Initialize the loss criterion for the DetectionModel."""
-        return v8DetectionLoss(self)
+        return E2EDetectLoss(self) if self.end2end else v8DetectionLoss(self)
 
 
 class OBBModel(DetectionModel):
@@ -689,8 +707,8 @@ def temporary_modules(modules={}, attributes={}):
 
     Example:
        ```python
-        with temporary_modules({'old.module.path': 'new.module.path'}, {'old.module.attribute': 'new.module.attribute'}):
-            import old.module.path  # this will now import new.module.path
+        with temporary_modules({'old.module': 'new.module'}, {'old.module.attribute': 'new.module.attribute'}):
+            import old.module  # this will now import new.module
             from old.module import attribute  # this will now import new.module.attribute
         ```
 
@@ -700,23 +718,19 @@
     applications or libraries. Use this function with caution.
     """
 
-    import importlib
     import sys
+    from importlib import import_module
 
     try:
         # Set attributes in sys.modules under their old name
         for old, new in attributes.items():
             old_module, old_attr = old.rsplit(".", 1)
             new_module, new_attr = new.rsplit(".", 1)
-            setattr(
-                importlib.import_module(old_module),
-                old_attr,
-                getattr(importlib.import_module(new_module), new_attr),
-            )
+            setattr(import_module(old_module), old_attr, getattr(import_module(new_module), new_attr))
 
         # Set modules in sys.modules under their old name
         for old, new in modules.items():
-            sys.modules[old] = importlib.import_module(new)
+            sys.modules[old] = import_module(new)
 
         yield
     finally:
@@ -750,9 +764,10 @@ def torch_safe_load(weight):
                 "ultralytics.yolo.data": "ultralytics.data",
             },
             attributes={
-                "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",
+                "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",  # YOLOv9e
+                "ultralytics.nn.tasks.YOLOv10DetectionModel": "ultralytics.nn.tasks.DetectionModel",  # YOLOv10
             },
-        ):  # for legacy 8.0 Classify and Pose models
+        ):
             ckpt = torch.load(file, map_location="cpu")
 
     except ModuleNotFoundError as e:  # e.name is missing module name
@@ -911,6 +926,9 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             DWConvTranspose2d,
             C3x,
             RepC3,
+            PSA,
+            SCDown,
+            C2fCIB,
         }:
             c1, c2 = ch[f], args[0]
             if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
@@ -922,7 +940,7 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
                 )  # num heads
 
             args = [c1, c2, *args[1:]]
-            if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3}:
+            if m in {BottleneckCSP, C1, C2, C2f, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3, C2fCIB}:
                 args.insert(2, n)  # number of repeats
                 n = 1
         elif m is AIFI:
@@ -939,7 +957,7 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             args = [ch[f]]
         elif m is Concat:
             c2 = sum(ch[x] for x in f)
-        elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn}:
+        elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn, v10Detect}:
             args.append([ch[x] for x in f])
             if m is Segment:
                 args[2] = make_divisible(min(args[2], max_channels) * width, 8)
@@ -1024,7 +1042,7 @@ def guess_model_task(model):
         m = cfg["head"][-1][-2].lower()  # output module name
         if m in {"classify", "classifier", "cls", "fc"}:
             return "classify"
-        if m == "detect":
+        if "detect" in m:
             return "detect"
         if m == "segment":
             return "segment"
@@ -1056,7 +1074,7 @@
                 return "pose"
             elif isinstance(m, OBB):
                 return "obb"
-            elif isinstance(m, (Detect, WorldDetect)):
+            elif isinstance(m, (Detect, WorldDetect, v10Detect)):
                 return "detect"
 
     # Guess from model filename
ultralytics/utils/benchmarks.py CHANGED
@@ -81,6 +81,7 @@ def benchmark(
     device = select_device(device, verbose=False)
     if isinstance(model, (str, Path)):
         model = YOLO(model)
+    is_end2end = getattr(model.model.model[-1], "end2end", False)
 
     y = []
     t0 = time.time()
@@ -96,14 +97,18 @@
                 assert MACOS or LINUX, "CoreML and TF.js export only supported on macOS and Linux"
                 assert not IS_RASPBERRYPI, "CoreML and TF.js export not supported on Raspberry Pi"
                 assert not IS_JETSON, "CoreML and TF.js export not supported on NVIDIA Jetson"
+                assert not is_end2end, "End-to-end models not supported by CoreML and TF.js yet"
            if i in {3, 5}:  # CoreML and OpenVINO
                 assert not IS_PYTHON_3_12, "CoreML and OpenVINO not supported on Python 3.12"
             if i in {6, 7, 8, 9, 10}:  # All TF formats
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
+                assert not is_end2end, "End-to-end models not supported by onnx2tf yet"
             if i in {11}:  # Paddle
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
+                assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
             if i in {12}:  # NCNN
                 assert not isinstance(model, YOLOWorld), "YOLOWorldv2 NCNN exports not supported yet"
+                assert not is_end2end, "End-to-end models not supported by NCNN yet"
             if "cpu" in device.type:
                 assert cpu, "inference not supported on CPU"
             if "cuda" in device.type:
ultralytics/utils/downloads.py CHANGED
@@ -23,6 +23,7 @@ GITHUB_ASSETS_NAMES = (
     + [f"yolov8{k}-world.pt" for k in "smlx"]
     + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
     + [f"yolov9{k}.pt" for k in "ce"]
+    + [f"yolov10{k}.pt" for k in "nsmblx"]
     + [f"yolo_nas_{k}.pt" for k in "sml"]
     + [f"sam_{k}.pt" for k in "bl"]
     + [f"FastSAM-{k}.pt" for k in "sx"]
ultralytics/utils/loss.py CHANGED
@@ -148,7 +148,7 @@ class KeypointLoss(nn.Module):
 class v8DetectionLoss:
     """Criterion class for computing training losses."""
 
-    def __init__(self, model):  # model must be de-paralleled
+    def __init__(self, model, tal_topk=10):  # model must be de-paralleled
         """Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function."""
         device = next(model.parameters()).device  # get model device
         h = model.args  # hyperparameters
@@ -164,7 +164,7 @@ class v8DetectionLoss:
 
         self.use_dfl = m.reg_max > 1
 
-        self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
+        self.assigner = TaskAlignedAssigner(topk=tal_topk, num_classes=self.nc, alpha=0.5, beta=6.0)
         self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=self.use_dfl).to(device)
         self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)
 
@@ -714,3 +714,21 @@ class v8OBBLoss(v8DetectionLoss):
         b, a, c = pred_dist.shape  # batch, anchors, channels
         pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
         return torch.cat((dist2rbox(pred_dist, pred_angle, anchor_points), pred_angle), dim=-1)
+
+
+class E2EDetectLoss:
+    """Criterion class for computing training losses."""
+
+    def __init__(self, model):
+        """Initialize E2EDetectLoss with one-to-many and one-to-one detection losses using the provided model."""
+        self.one2many = v8DetectionLoss(model, tal_topk=10)
+        self.one2one = v8DetectionLoss(model, tal_topk=1)
+
+    def __call__(self, preds, batch):
+        """Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
+        preds = preds[1] if isinstance(preds, tuple) else preds
+        one2many = preds["one2many"]
+        loss_one2many = self.one2many(one2many, batch)
+        one2one = preds["one2one"]
+        loss_one2one = self.one2one(one2one, batch)
+        return loss_one2many[0] + loss_one2one[0], loss_one2many[1] + loss_one2one[1]
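E2EDetectLoss above is just two v8DetectionLoss criteria with different assigner budgets: a one-to-many branch (top-k 10) and a one-to-one branch (top-k 1), whose totals and logged items are summed. A trivial sketch of that combination, with made-up per-branch results standing in for real loss outputs:

    import torch

    loss_one2many = (torch.tensor(1.5), torch.tensor([0.8, 0.4, 0.3]))  # (total, [box, cls, dfl]) stand-ins
    loss_one2one = (torch.tensor(0.9), torch.tensor([0.5, 0.2, 0.2]))
    total = loss_one2many[0] + loss_one2one[0]   # value used for backprop
    items = loss_one2many[1] + loss_one2one[1]   # components reported during training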
ultralytics/utils/metrics.py CHANGED
@@ -64,8 +64,9 @@ def box_iou(box1, box2, eps=1e-7):
         (torch.Tensor): An NxM tensor containing the pairwise IoU values for every element in box1 and box2.
     """
 
+    # NOTE: Need .float() to get accurate iou values
     # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
-    (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
+    (a1, a2), (b1, b2) = box1.float().unsqueeze(1).chunk(2, 2), box2.float().unsqueeze(0).chunk(2, 2)
     inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)
 
     # IoU = inter / (area1 + area2 - inter)
ultralytics/utils/ops.py CHANGED
@@ -213,6 +213,9 @@ def non_max_suppression(
     if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation model, output = (inference_out, loss_out)
         prediction = prediction[0]  # select only inference output
 
+    if prediction.shape[-1] == 6:  # end-to-end model
+        return [pred[pred[:, 4] > conf_thres] for pred in prediction]
+
     bs = prediction.shape[0]  # batch size
     nc = nc or (prediction.shape[1] - 4)  # number of classes
     nm = prediction.shape[1] - nc - 4  # number of masks
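The non_max_suppression change above short-circuits for end-to-end outputs: a prediction tensor whose last dimension is 6 is assumed to be already one-to-one (box, score, class) and is only confidence-filtered, with no NMS. A minimal sketch of that path, using random tensors in place of real v10Detect output:

    import torch

    conf_thres = 0.25
    prediction = torch.rand(2, 300, 6)   # (batch, max_det, [box(4), score, cls])
    if prediction.shape[-1] == 6:        # end-to-end model: skip NMS entirely
        output = [pred[pred[:, 4] > conf_thres] for pred in prediction]
    print([o.shape for o in output])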
ultralytics-8.2.37.dist-info/METADATA → ultralytics-8.2.38.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ultralytics
-Version: 8.2.37
+Version: 8.2.38
 Summary: Ultralytics YOLOv8 for SOTA object detection, multi-object tracking, instance segmentation, pose estimation and image classification.
 Author: Glenn Jocher, Ayush Chaurasia, Jing Qiu
 Maintainer: Glenn Jocher, Ayush Chaurasia, Jing Qiu
ultralytics-8.2.37.dist-info/RECORD → ultralytics-8.2.38.dist-info/RECORD
@@ -6,8 +6,8 @@ tests/test_engine.py,sha256=fFzcbqZuMkzZHjA5FMddWcqVE703iq8HB_a0Q2lcBKM,4705
 tests/test_explorer.py,sha256=r1pWer2y290Y0DqsM-La7egfEY0497YCdC4rwq3URV4,2178
 tests/test_exports.py,sha256=qc4YOgsGixqYLO6IRNY16-v6z14R0dp5fdni1v222xw,8034
 tests/test_integrations.py,sha256=8Ru7GyKV8j44EEc8X9_E7q7aR4CTOIMPuSagXjSGUxw,5847
-tests/test_python.py,sha256=5cTM45P77LoOl-qixJ7TQmf66zw69adj01kNaaSxHqE,20265
-ultralytics/__init__.py,sha256=SZ2J0Bd3FrWlOh7a0GS_8EnhlKDLXT2cih66PzAHgfU,694
+tests/test_python.py,sha256=9KjBKQXj6T9hRfX-4nnERd7OR3xx2ejV8430BoXjHro,20536
+ultralytics/__init__.py,sha256=R8tdSpt8DjyJbiH1TGuPXSrbharV6KDLd3efp-aYEcA,694
 ultralytics/assets/bus.jpg,sha256=wCAZxJecGR63Od3ZRERe9Aja1Weayrb9Ug751DS_vGM,137419
 ultralytics/assets/zidane.jpg,sha256=Ftc4aeMmen1O0A3o6GCDO9FlfBslLpTAw0gnetx7bts,50427
 ultralytics/cfg/__init__.py,sha256=JblkT6Ze9MZ8hSs8gkV8JPcEKNMm-YqRqM4x501Dn9g,21507
@@ -43,6 +43,12 @@ ultralytics/cfg/models/rt-detr/rtdetr-l.yaml,sha256=Nbzi93tAJhBw69hUNBkzXaeMMWwW
 ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml,sha256=o0nWoKciT-vypC2eS5qIEWNSac0L6vwLtbK9ucQluG4,1512
 ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml,sha256=rb64WQK-3a_PebUcy6CbpskvlC74H9M3tMIr3R5vHDU,1510
 ultralytics/cfg/models/rt-detr/rtdetr-x.yaml,sha256=E5utqNL7oNztyPKySGPoVET8RIUeqAqchdaslu5Zb5g,2141
+ultralytics/cfg/models/v10/yolov10b.yaml,sha256=GBN4p-I54eSvbFv4VpUavOY9uuUSv3wAnAXyvYZkE5w,1401
+ultralytics/cfg/models/v10/yolov10l.yaml,sha256=vXbJXGj-rISV83doIKujlI5XjeD3PUyzSrNleSPns1g,1401
+ultralytics/cfg/models/v10/yolov10m.yaml,sha256=VS915roEcpJDtVNtzH0OrJNM9FY2rCsz7zY0YU6v9gs,1392
+ultralytics/cfg/models/v10/yolov10n.yaml,sha256=f7sJ49GL2IF5kXd9oh19W_cdUgbrFZLlp5jz6j-jO0M,1387
+ultralytics/cfg/models/v10/yolov10s.yaml,sha256=WaOa5eAGiNEwPZsni01dlcLWyNkonZ4Tjvxxm7w0WFE,1396
+ultralytics/cfg/models/v10/yolov10x.yaml,sha256=kMtkDJutUSTkw_aznpaoQ4YGUJpFTxoR1cxz31oqOKA,1404
 ultralytics/cfg/models/v3/yolov3-spp.yaml,sha256=NfKJeBpDgDSwXo7fSN8myQUQ68YLB9xRtqdBgGlVPHs,1525
 ultralytics/cfg/models/v3/yolov3-tiny.yaml,sha256=5mnGGCN-mNDvqvOz2AzGhfwEg01exzeHNPS3NA3poiY,1229
 ultralytics/cfg/models/v3/yolov3.yaml,sha256=-94p4tePdDtdpnz79u7O1sChV69kTi01lFxcVGoJ8MY,1512
@@ -91,7 +97,7 @@ ultralytics/data/explorer/utils.py,sha256=EvvukQiQUTBrsZznmMnyEX2EqTuwZo_Geyc8yf
 ultralytics/data/explorer/gui/__init__.py,sha256=mHtJuK4hwF8cuV-VHDc7tp6u6D1gHz2Z7JI8grmQDTs,42
 ultralytics/data/explorer/gui/dash.py,sha256=CPlFIIhf53j_YVAqealsC3AbcztdPqZxfniQcBnlKK4,10042
 ultralytics/engine/__init__.py,sha256=mHtJuK4hwF8cuV-VHDc7tp6u6D1gHz2Z7JI8grmQDTs,42
-ultralytics/engine/exporter.py,sha256=JWVmXMD8RpXOayisS2_Q4gSeqvKAeXfMt2Y-azOZiIo,58464
+ultralytics/engine/exporter.py,sha256=0JUk5kMqr9argOqfXWP9WaxoVJSo5C4NMUIrPfEjni0,58534
 ultralytics/engine/model.py,sha256=qSvCT-l8mLT-CDixy6mjyC7N5x3edsWmobRWbojwLUM,40073
 ultralytics/engine/predictor.py,sha256=W58kDCFH2AfoFzpGbos3k8zUEVsLunBuM8sc2B64rPY,17449
 ultralytics/engine/results.py,sha256=zRuEIrBtpoCQ3M6a_YscnyXrWSP-zpL3ACv0gTdrDaw,30987
@@ -159,11 +165,11 @@ ultralytics/models/yolo/world/train.py,sha256=acYN2-onL69LrL4av6_hY2r5AY0urC0WVi
 ultralytics/models/yolo/world/train_world.py,sha256=n0XTAHYxufHU5OZ_QjpkHieKik-24z0LrYKzWYbCLvA,4798
 ultralytics/nn/__init__.py,sha256=4BPLHY89xEM_al5uK0aOmFgiML6CMGEZbezxOvTjOEs,587
 ultralytics/nn/autobackend.py,sha256=zsMF-GS12xtMBeQEkSoJ5cudEHyzMaRSQBuXcfuBNdo,31210
-ultralytics/nn/tasks.py,sha256=extgDOPk2wHFxjiyOMotM68AqeGzNrMwehEdi5lX0JE,44954
-ultralytics/nn/modules/__init__.py,sha256=JPj_TloK33DdxS8gvA8Pcet5ax1SgbRcb5mTTOS0DCI,2371
-ultralytics/nn/modules/block.py,sha256=3SfxkNMBKbjzAzNrt_CeGxpeBLkrdko7n07cDSIY6gg,25781
+ultralytics/nn/tasks.py,sha256=R3zOAzW3lyC0vAsNFagx49hPKKHQxt8MTMkEhJ_0AZI,45447
+ultralytics/nn/modules/__init__.py,sha256=9rGYw0c_XjBVs7rwj1RiM4U58_TlN4i2Ufl8hoL42J0,2536
+ultralytics/nn/modules/block.py,sha256=s7bbxc4aINGBdxuvIcUbknzmWTmBwui5W0T9diie0D4,34403
 ultralytics/nn/modules/conv.py,sha256=Ywe87IhuaS22mR2JJ9xjnW8Sb-m7WTjxuqIxV_Dv8lI,12722
-ultralytics/nn/modules/head.py,sha256=3N_4zW1UvhI1jCrIxIkNYxQDdiW6HxtxpaNAAudq6NU,22236
+ultralytics/nn/modules/head.py,sha256=hR-_hRMZMizl5Ttnx_FEzy8T3_58PJRLreeIzw2TVE4,26761
 ultralytics/nn/modules/transformer.py,sha256=AxD9uURpCl-EqvXe3DiG6JW-pBzB16G-AahLdZ7yayo,17909
 ultralytics/nn/modules/utils.py,sha256=779QnnKp9v8jv251ESduTXJ0ol8HkIOLbGQWwEGQjhU,3196
 ultralytics/solutions/__init__.py,sha256=S4m7p_rpg2pk9PdnqqD-6Sk--wDHxZSo7cUZjSwj_iQ,561
@@ -186,16 +192,16 @@ ultralytics/trackers/utils/kalman_filter.py,sha256=0oqhk59NKEiwcJ2FXnw6_sT4bIFC6
 ultralytics/trackers/utils/matching.py,sha256=UxhSGa5pN6WoYwYSBAkkt-O7xMxUR47VuUB6PfVNkb4,5404
 ultralytics/utils/__init__.py,sha256=jrPWtLQEZJtbumqRrctgUikpAzS62Xm0iPy73iqIGSs,38640
 ultralytics/utils/autobatch.py,sha256=gPFcREMsMHRAuTQiBnNZ9Mm1XNqmQW-uMPhveDFEQ_Y,3966
-ultralytics/utils/benchmarks.py,sha256=tBVe5Q4HZABpjpI1LDqpT8bJSoZFhsAEtyZCHx8dMIg,23120
+ultralytics/utils/benchmarks.py,sha256=tDX7wu0TpMMlEQDOFqfkjxl156ssS7Lh_5tFWIXdJfg,23549
 ultralytics/utils/checks.py,sha256=PDY1eHlsyDVEIiKRjvb81uz2jniL1MqgP_TmXH_78KM,28379
 ultralytics/utils/dist.py,sha256=3HeNbY2gp7vYhcvVhsrvTrQXpQmgT8tpmnzApf3eQRA,2267
-ultralytics/utils/downloads.py,sha256=cmO2Ev1DV1m_lYgQ2yGDG5xVRIBVS_z9nS_Frec_NeU,21496
+ultralytics/utils/downloads.py,sha256=AcO0vT4jZd3BJz4dhYYci8PKWJxlqAGraqo_IlU2kYE,21539
 ultralytics/utils/errors.py,sha256=GqP_Jgj_n0paxn8OMhn3DTCgoNkB2WjUcUaqs-M6SQk,816
 ultralytics/utils/files.py,sha256=TVfY0Wi5IsUc4YdsDzC0dAg-jAP5exYvwqB3VmXhDLY,6761
 ultralytics/utils/instance.py,sha256=5daM5nkxBv9hr5QzyII8zmuFj24hHuNtcr4EMCHAtpY,15654
-ultralytics/utils/loss.py,sha256=ejXnPEIAzNEoNz2UjW0_fcdeUs9Hy-jPzUrJ3FiIIwE,32717
-ultralytics/utils/metrics.py,sha256=XPD-xP0fchR8KgCuTcihV2-n0EK1cWi3-53BWN_pLuA,53518
-ultralytics/utils/ops.py,sha256=J9wbb9aTW9aaI5DJRqA72BZAX77cmVyCJdnGuwkDu-k,33089
+ultralytics/utils/loss.py,sha256=RF0st6IPW5pFhUMYHXCQ9msNJbPPeD8dRdQDn6HwZN8,33539
+ultralytics/utils/metrics.py,sha256=3nuFZK_7rnhf6KjhflnRfHVN2i_ZB-LbGvIdbc177N8,53587
+ultralytics/utils/ops.py,sha256=A6MnypWNEpgOQRJpPwE3JMi2rUQWaDmBklIaaqvu3Lc,33214
 ultralytics/utils/patches.py,sha256=SgMqeMsq2K6JoBJP1NplXMl9C6rK0JeJUChjBrJOneo,2750
 ultralytics/utils/plotting.py,sha256=I3YYLSsmj1BX8S5DphsedAm0RfisrPbeLpyuzsKXbqY,53288
 ultralytics/utils/tal.py,sha256=xuIyryUjaaYHkHPG9GvBwh1xxN2Hq4y3hXOtuERehwY,16017
@@ -213,9 +219,9 @@ ultralytics/utils/callbacks/neptune.py,sha256=5Z3ua5YBTUS56FH8VQKQG1aaIo9fH8GEyz
 ultralytics/utils/callbacks/raytune.py,sha256=ODVYzy-CoM4Uge0zjkh3Hnh9nF2M0vhDrSenXnvcizw,705
 ultralytics/utils/callbacks/tensorboard.py,sha256=QEgOVhUqY9akOs5TJIwz1Rvn6l32xWLpOxlwEyWF0B8,4136
 ultralytics/utils/callbacks/wb.py,sha256=9-fjQIdLjr3b73DTE3rHO171KvbH1VweJ-bmbv-rqTw,6747
-ultralytics-8.2.37.dist-info/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
-ultralytics-8.2.37.dist-info/METADATA,sha256=LHuqk6NTu__ZhHOS1G0EldVE8hSCUtsXdmGtp55pHRQ,41316
-ultralytics-8.2.37.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
-ultralytics-8.2.37.dist-info/entry_points.txt,sha256=YM_wiKyTe9yRrsEfqvYolNO5ngwfoL4-NwgKzc8_7sI,93
-ultralytics-8.2.37.dist-info/top_level.txt,sha256=XP49TwiMw4QGsvTLSYiJhz1xF_k7ev5mQ8jJXaXi45Q,12
-ultralytics-8.2.37.dist-info/RECORD,,
+ultralytics-8.2.38.dist-info/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+ultralytics-8.2.38.dist-info/METADATA,sha256=mcUpRkBNjCoRy3p0szqbea_wxZ2pxCsAIykwsyflrV8,41316
+ultralytics-8.2.38.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
+ultralytics-8.2.38.dist-info/entry_points.txt,sha256=YM_wiKyTe9yRrsEfqvYolNO5ngwfoL4-NwgKzc8_7sI,93
+ultralytics-8.2.38.dist-info/top_level.txt,sha256=XP49TwiMw4QGsvTLSYiJhz1xF_k7ev5mQ8jJXaXi45Q,12
+ultralytics-8.2.38.dist-info/RECORD,,