dgenerate-ultralytics-headless 8.3.214__py3-none-any.whl → 8.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/METADATA +64 -74
  2. dgenerate_ultralytics_headless-8.4.7.dist-info/RECORD +311 -0
  3. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/WHEEL +1 -1
  4. tests/__init__.py +7 -9
  5. tests/conftest.py +8 -15
  6. tests/test_cli.py +1 -1
  7. tests/test_cuda.py +13 -10
  8. tests/test_engine.py +9 -9
  9. tests/test_exports.py +65 -13
  10. tests/test_integrations.py +13 -13
  11. tests/test_python.py +125 -69
  12. tests/test_solutions.py +161 -152
  13. ultralytics/__init__.py +1 -1
  14. ultralytics/cfg/__init__.py +86 -92
  15. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  16. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  17. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  18. ultralytics/cfg/datasets/ImageNet.yaml +1 -1
  19. ultralytics/cfg/datasets/TT100K.yaml +346 -0
  20. ultralytics/cfg/datasets/VOC.yaml +15 -16
  21. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  22. ultralytics/cfg/datasets/coco-pose.yaml +21 -0
  23. ultralytics/cfg/datasets/coco12-formats.yaml +101 -0
  24. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  25. ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
  26. ultralytics/cfg/datasets/dog-pose.yaml +28 -0
  27. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  28. ultralytics/cfg/datasets/dota8.yaml +2 -2
  29. ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
  30. ultralytics/cfg/datasets/kitti.yaml +27 -0
  31. ultralytics/cfg/datasets/lvis.yaml +5 -5
  32. ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
  33. ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
  34. ultralytics/cfg/datasets/xView.yaml +16 -16
  35. ultralytics/cfg/default.yaml +4 -2
  36. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  37. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  38. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  39. ultralytics/cfg/models/26/yolo26-cls.yaml +33 -0
  40. ultralytics/cfg/models/26/yolo26-obb.yaml +52 -0
  41. ultralytics/cfg/models/26/yolo26-p2.yaml +60 -0
  42. ultralytics/cfg/models/26/yolo26-p6.yaml +62 -0
  43. ultralytics/cfg/models/26/yolo26-pose.yaml +53 -0
  44. ultralytics/cfg/models/26/yolo26-seg.yaml +52 -0
  45. ultralytics/cfg/models/26/yolo26.yaml +52 -0
  46. ultralytics/cfg/models/26/yoloe-26-seg.yaml +53 -0
  47. ultralytics/cfg/models/26/yoloe-26.yaml +53 -0
  48. ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
  49. ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
  50. ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
  51. ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
  52. ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
  53. ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
  54. ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
  55. ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
  56. ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
  57. ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
  58. ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
  59. ultralytics/cfg/models/v6/yolov6.yaml +1 -1
  60. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  61. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  62. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  63. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  64. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  65. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  66. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  67. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  68. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  69. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  70. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  71. ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
  72. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  73. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  74. ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
  75. ultralytics/data/__init__.py +4 -4
  76. ultralytics/data/annotator.py +5 -6
  77. ultralytics/data/augment.py +300 -475
  78. ultralytics/data/base.py +18 -26
  79. ultralytics/data/build.py +147 -25
  80. ultralytics/data/converter.py +108 -87
  81. ultralytics/data/dataset.py +47 -75
  82. ultralytics/data/loaders.py +42 -49
  83. ultralytics/data/split.py +5 -6
  84. ultralytics/data/split_dota.py +8 -15
  85. ultralytics/data/utils.py +36 -45
  86. ultralytics/engine/exporter.py +351 -263
  87. ultralytics/engine/model.py +186 -225
  88. ultralytics/engine/predictor.py +45 -54
  89. ultralytics/engine/results.py +198 -325
  90. ultralytics/engine/trainer.py +165 -106
  91. ultralytics/engine/tuner.py +41 -43
  92. ultralytics/engine/validator.py +55 -38
  93. ultralytics/hub/__init__.py +16 -19
  94. ultralytics/hub/auth.py +6 -12
  95. ultralytics/hub/google/__init__.py +7 -10
  96. ultralytics/hub/session.py +15 -25
  97. ultralytics/hub/utils.py +5 -8
  98. ultralytics/models/__init__.py +1 -1
  99. ultralytics/models/fastsam/__init__.py +1 -1
  100. ultralytics/models/fastsam/model.py +8 -10
  101. ultralytics/models/fastsam/predict.py +18 -30
  102. ultralytics/models/fastsam/utils.py +1 -2
  103. ultralytics/models/fastsam/val.py +5 -7
  104. ultralytics/models/nas/__init__.py +1 -1
  105. ultralytics/models/nas/model.py +5 -8
  106. ultralytics/models/nas/predict.py +7 -9
  107. ultralytics/models/nas/val.py +1 -2
  108. ultralytics/models/rtdetr/__init__.py +1 -1
  109. ultralytics/models/rtdetr/model.py +5 -8
  110. ultralytics/models/rtdetr/predict.py +15 -19
  111. ultralytics/models/rtdetr/train.py +10 -13
  112. ultralytics/models/rtdetr/val.py +21 -23
  113. ultralytics/models/sam/__init__.py +15 -2
  114. ultralytics/models/sam/amg.py +14 -20
  115. ultralytics/models/sam/build.py +26 -19
  116. ultralytics/models/sam/build_sam3.py +377 -0
  117. ultralytics/models/sam/model.py +29 -32
  118. ultralytics/models/sam/modules/blocks.py +83 -144
  119. ultralytics/models/sam/modules/decoders.py +19 -37
  120. ultralytics/models/sam/modules/encoders.py +44 -101
  121. ultralytics/models/sam/modules/memory_attention.py +16 -30
  122. ultralytics/models/sam/modules/sam.py +200 -73
  123. ultralytics/models/sam/modules/tiny_encoder.py +64 -83
  124. ultralytics/models/sam/modules/transformer.py +18 -28
  125. ultralytics/models/sam/modules/utils.py +174 -50
  126. ultralytics/models/sam/predict.py +2248 -350
  127. ultralytics/models/sam/sam3/__init__.py +3 -0
  128. ultralytics/models/sam/sam3/decoder.py +546 -0
  129. ultralytics/models/sam/sam3/encoder.py +529 -0
  130. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  131. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  132. ultralytics/models/sam/sam3/model_misc.py +199 -0
  133. ultralytics/models/sam/sam3/necks.py +129 -0
  134. ultralytics/models/sam/sam3/sam3_image.py +339 -0
  135. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  136. ultralytics/models/sam/sam3/vitdet.py +547 -0
  137. ultralytics/models/sam/sam3/vl_combiner.py +160 -0
  138. ultralytics/models/utils/loss.py +14 -26
  139. ultralytics/models/utils/ops.py +13 -17
  140. ultralytics/models/yolo/__init__.py +1 -1
  141. ultralytics/models/yolo/classify/predict.py +10 -13
  142. ultralytics/models/yolo/classify/train.py +12 -33
  143. ultralytics/models/yolo/classify/val.py +30 -29
  144. ultralytics/models/yolo/detect/predict.py +9 -12
  145. ultralytics/models/yolo/detect/train.py +17 -23
  146. ultralytics/models/yolo/detect/val.py +77 -59
  147. ultralytics/models/yolo/model.py +43 -60
  148. ultralytics/models/yolo/obb/predict.py +7 -16
  149. ultralytics/models/yolo/obb/train.py +14 -17
  150. ultralytics/models/yolo/obb/val.py +40 -37
  151. ultralytics/models/yolo/pose/__init__.py +1 -1
  152. ultralytics/models/yolo/pose/predict.py +7 -22
  153. ultralytics/models/yolo/pose/train.py +13 -16
  154. ultralytics/models/yolo/pose/val.py +39 -58
  155. ultralytics/models/yolo/segment/predict.py +17 -21
  156. ultralytics/models/yolo/segment/train.py +7 -10
  157. ultralytics/models/yolo/segment/val.py +95 -47
  158. ultralytics/models/yolo/world/train.py +8 -14
  159. ultralytics/models/yolo/world/train_world.py +11 -34
  160. ultralytics/models/yolo/yoloe/__init__.py +7 -7
  161. ultralytics/models/yolo/yoloe/predict.py +16 -23
  162. ultralytics/models/yolo/yoloe/train.py +36 -44
  163. ultralytics/models/yolo/yoloe/train_seg.py +11 -11
  164. ultralytics/models/yolo/yoloe/val.py +15 -20
  165. ultralytics/nn/__init__.py +7 -7
  166. ultralytics/nn/autobackend.py +159 -85
  167. ultralytics/nn/modules/__init__.py +68 -60
  168. ultralytics/nn/modules/activation.py +4 -6
  169. ultralytics/nn/modules/block.py +260 -224
  170. ultralytics/nn/modules/conv.py +52 -97
  171. ultralytics/nn/modules/head.py +831 -299
  172. ultralytics/nn/modules/transformer.py +76 -88
  173. ultralytics/nn/modules/utils.py +16 -21
  174. ultralytics/nn/tasks.py +180 -195
  175. ultralytics/nn/text_model.py +45 -69
  176. ultralytics/optim/__init__.py +5 -0
  177. ultralytics/optim/muon.py +338 -0
  178. ultralytics/solutions/__init__.py +12 -12
  179. ultralytics/solutions/ai_gym.py +13 -19
  180. ultralytics/solutions/analytics.py +15 -16
  181. ultralytics/solutions/config.py +6 -7
  182. ultralytics/solutions/distance_calculation.py +10 -13
  183. ultralytics/solutions/heatmap.py +8 -14
  184. ultralytics/solutions/instance_segmentation.py +6 -9
  185. ultralytics/solutions/object_blurrer.py +7 -10
  186. ultralytics/solutions/object_counter.py +12 -19
  187. ultralytics/solutions/object_cropper.py +8 -14
  188. ultralytics/solutions/parking_management.py +34 -32
  189. ultralytics/solutions/queue_management.py +10 -12
  190. ultralytics/solutions/region_counter.py +9 -12
  191. ultralytics/solutions/security_alarm.py +15 -20
  192. ultralytics/solutions/similarity_search.py +10 -15
  193. ultralytics/solutions/solutions.py +77 -76
  194. ultralytics/solutions/speed_estimation.py +7 -10
  195. ultralytics/solutions/streamlit_inference.py +2 -4
  196. ultralytics/solutions/templates/similarity-search.html +7 -18
  197. ultralytics/solutions/trackzone.py +7 -10
  198. ultralytics/solutions/vision_eye.py +5 -8
  199. ultralytics/trackers/__init__.py +1 -1
  200. ultralytics/trackers/basetrack.py +3 -5
  201. ultralytics/trackers/bot_sort.py +10 -27
  202. ultralytics/trackers/byte_tracker.py +21 -37
  203. ultralytics/trackers/track.py +4 -7
  204. ultralytics/trackers/utils/gmc.py +11 -22
  205. ultralytics/trackers/utils/kalman_filter.py +37 -48
  206. ultralytics/trackers/utils/matching.py +12 -15
  207. ultralytics/utils/__init__.py +124 -124
  208. ultralytics/utils/autobatch.py +2 -4
  209. ultralytics/utils/autodevice.py +17 -18
  210. ultralytics/utils/benchmarks.py +57 -71
  211. ultralytics/utils/callbacks/base.py +8 -10
  212. ultralytics/utils/callbacks/clearml.py +5 -13
  213. ultralytics/utils/callbacks/comet.py +32 -46
  214. ultralytics/utils/callbacks/dvc.py +13 -18
  215. ultralytics/utils/callbacks/mlflow.py +4 -5
  216. ultralytics/utils/callbacks/neptune.py +7 -15
  217. ultralytics/utils/callbacks/platform.py +423 -38
  218. ultralytics/utils/callbacks/raytune.py +3 -4
  219. ultralytics/utils/callbacks/tensorboard.py +25 -31
  220. ultralytics/utils/callbacks/wb.py +16 -14
  221. ultralytics/utils/checks.py +127 -85
  222. ultralytics/utils/cpu.py +3 -8
  223. ultralytics/utils/dist.py +9 -12
  224. ultralytics/utils/downloads.py +25 -33
  225. ultralytics/utils/errors.py +6 -14
  226. ultralytics/utils/events.py +2 -4
  227. ultralytics/utils/export/__init__.py +4 -236
  228. ultralytics/utils/export/engine.py +246 -0
  229. ultralytics/utils/export/imx.py +117 -63
  230. ultralytics/utils/export/tensorflow.py +231 -0
  231. ultralytics/utils/files.py +26 -30
  232. ultralytics/utils/git.py +9 -11
  233. ultralytics/utils/instance.py +30 -51
  234. ultralytics/utils/logger.py +212 -114
  235. ultralytics/utils/loss.py +601 -215
  236. ultralytics/utils/metrics.py +128 -156
  237. ultralytics/utils/nms.py +13 -16
  238. ultralytics/utils/ops.py +117 -166
  239. ultralytics/utils/patches.py +75 -21
  240. ultralytics/utils/plotting.py +75 -80
  241. ultralytics/utils/tal.py +125 -59
  242. ultralytics/utils/torch_utils.py +53 -79
  243. ultralytics/utils/tqdm.py +24 -21
  244. ultralytics/utils/triton.py +13 -19
  245. ultralytics/utils/tuner.py +19 -10
  246. dgenerate_ultralytics_headless-8.3.214.dist-info/RECORD +0 -283
  247. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/entry_points.txt +0 -0
  248. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/licenses/LICENSE +0 -0
  249. {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/top_level.txt +0 -0
@@ -13,58 +13,56 @@ from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
13
13
  from .transformer import TransformerBlock
14
14
 
15
15
  __all__ = (
16
- "DFL",
17
- "HGBlock",
18
- "HGStem",
19
- "SPP",
20
- "SPPF",
21
16
  "C1",
22
17
  "C2",
18
+ "C2PSA",
23
19
  "C3",
20
+ "C3TR",
21
+ "CIB",
22
+ "DFL",
23
+ "ELAN1",
24
+ "PSA",
25
+ "SPP",
26
+ "SPPELAN",
27
+ "SPPF",
28
+ "AConv",
29
+ "ADown",
30
+ "Attention",
31
+ "BNContrastiveHead",
32
+ "Bottleneck",
33
+ "BottleneckCSP",
24
34
  "C2f",
25
35
  "C2fAttn",
26
- "ImagePoolingAttn",
27
- "ContrastiveHead",
28
- "BNContrastiveHead",
29
- "C3x",
30
- "C3TR",
36
+ "C2fCIB",
37
+ "C2fPSA",
31
38
  "C3Ghost",
39
+ "C3k2",
40
+ "C3x",
41
+ "CBFuse",
42
+ "CBLinear",
43
+ "ContrastiveHead",
32
44
  "GhostBottleneck",
33
- "Bottleneck",
34
- "BottleneckCSP",
45
+ "HGBlock",
46
+ "HGStem",
47
+ "ImagePoolingAttn",
35
48
  "Proto",
36
49
  "RepC3",
37
- "ResNetLayer",
38
50
  "RepNCSPELAN4",
39
- "ELAN1",
40
- "ADown",
41
- "AConv",
42
- "SPPELAN",
43
- "CBFuse",
44
- "CBLinear",
45
- "C3k2",
46
- "C2fPSA",
47
- "C2PSA",
48
51
  "RepVGGDW",
49
- "CIB",
50
- "C2fCIB",
51
- "Attention",
52
- "PSA",
52
+ "ResNetLayer",
53
53
  "SCDown",
54
54
  "TorchVision",
55
55
  )
56
56
 
57
57
 
58
58
  class DFL(nn.Module):
59
- """
60
- Integral module of Distribution Focal Loss (DFL).
59
+ """Integral module of Distribution Focal Loss (DFL).
61
60
 
62
61
  Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
63
62
  """
64
63
 
65
64
  def __init__(self, c1: int = 16):
66
- """
67
- Initialize a convolutional layer with a given number of input channels.
65
+ """Initialize a convolutional layer with a given number of input channels.
68
66
 
69
67
  Args:
70
68
  c1 (int): Number of input channels.
@@ -86,8 +84,7 @@ class Proto(nn.Module):
86
84
  """Ultralytics YOLO models mask Proto module for segmentation models."""
87
85
 
88
86
  def __init__(self, c1: int, c_: int = 256, c2: int = 32):
89
- """
90
- Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.
87
+ """Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.
91
88
 
92
89
  Args:
93
90
  c1 (int): Input channels.
@@ -106,15 +103,13 @@ class Proto(nn.Module):
106
103
 
107
104
 
108
105
  class HGStem(nn.Module):
109
- """
110
- StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
106
+ """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
111
107
 
112
108
  https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
113
109
  """
114
110
 
115
111
  def __init__(self, c1: int, cm: int, c2: int):
116
- """
117
- Initialize the StemBlock of PPHGNetV2.
112
+ """Initialize the StemBlock of PPHGNetV2.
118
113
 
119
114
  Args:
120
115
  c1 (int): Input channels.
@@ -144,8 +139,7 @@ class HGStem(nn.Module):
144
139
 
145
140
 
146
141
  class HGBlock(nn.Module):
147
- """
148
- HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
142
+ """HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
149
143
 
150
144
  https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
151
145
  """
@@ -161,8 +155,7 @@ class HGBlock(nn.Module):
161
155
  shortcut: bool = False,
162
156
  act: nn.Module = nn.ReLU(),
163
157
  ):
164
- """
165
- Initialize HGBlock with specified parameters.
158
+ """Initialize HGBlock with specified parameters.
166
159
 
167
160
  Args:
168
161
  c1 (int): Input channels.
@@ -193,8 +186,7 @@ class SPP(nn.Module):
193
186
  """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
194
187
 
195
188
  def __init__(self, c1: int, c2: int, k: tuple[int, ...] = (5, 9, 13)):
196
- """
197
- Initialize the SPP layer with input/output channels and pooling kernel sizes.
189
+ """Initialize the SPP layer with input/output channels and pooling kernel sizes.
198
190
 
199
191
  Args:
200
192
  c1 (int): Input channels.
@@ -216,37 +208,40 @@ class SPP(nn.Module):
216
208
  class SPPF(nn.Module):
217
209
  """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
218
210
 
219
- def __init__(self, c1: int, c2: int, k: int = 5):
220
- """
221
- Initialize the SPPF layer with given input/output channels and kernel size.
211
+ def __init__(self, c1: int, c2: int, k: int = 5, n: int = 3, shortcut: bool = False):
212
+ """Initialize the SPPF layer with given input/output channels and kernel size.
222
213
 
223
214
  Args:
224
215
  c1 (int): Input channels.
225
216
  c2 (int): Output channels.
226
217
  k (int): Kernel size.
218
+ n (int): Number of pooling iterations.
219
+ shortcut (bool): Whether to use shortcut connection.
227
220
 
228
221
  Notes:
229
222
  This module is equivalent to SPP(k=(5, 9, 13)).
230
223
  """
231
224
  super().__init__()
232
225
  c_ = c1 // 2 # hidden channels
233
- self.cv1 = Conv(c1, c_, 1, 1)
234
- self.cv2 = Conv(c_ * 4, c2, 1, 1)
226
+ self.cv1 = Conv(c1, c_, 1, 1, act=False)
227
+ self.cv2 = Conv(c_ * (n + 1), c2, 1, 1)
235
228
  self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
229
+ self.n = n
230
+ self.add = shortcut and c1 == c2
236
231
 
237
232
  def forward(self, x: torch.Tensor) -> torch.Tensor:
238
233
  """Apply sequential pooling operations to input and return concatenated feature maps."""
239
234
  y = [self.cv1(x)]
240
- y.extend(self.m(y[-1]) for _ in range(3))
241
- return self.cv2(torch.cat(y, 1))
235
+ y.extend(self.m(y[-1]) for _ in range(getattr(self, "n", 3)))
236
+ y = self.cv2(torch.cat(y, 1))
237
+ return y + x if getattr(self, "add", False) else y
242
238
 
243
239
 
244
240
  class C1(nn.Module):
245
241
  """CSP Bottleneck with 1 convolution."""
246
242
 
247
243
  def __init__(self, c1: int, c2: int, n: int = 1):
248
- """
249
- Initialize the CSP Bottleneck with 1 convolution.
244
+ """Initialize the CSP Bottleneck with 1 convolution.
250
245
 
251
246
  Args:
252
247
  c1 (int): Input channels.
@@ -267,8 +262,7 @@ class C2(nn.Module):
267
262
  """CSP Bottleneck with 2 convolutions."""
268
263
 
269
264
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
270
- """
271
- Initialize a CSP Bottleneck with 2 convolutions.
265
+ """Initialize a CSP Bottleneck with 2 convolutions.
272
266
 
273
267
  Args:
274
268
  c1 (int): Input channels.
@@ -295,8 +289,7 @@ class C2f(nn.Module):
295
289
  """Faster Implementation of CSP Bottleneck with 2 convolutions."""
296
290
 
297
291
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = False, g: int = 1, e: float = 0.5):
298
- """
299
- Initialize a CSP bottleneck with 2 convolutions.
292
+ """Initialize a CSP bottleneck with 2 convolutions.
300
293
 
301
294
  Args:
302
295
  c1 (int): Input channels.
@@ -330,8 +323,7 @@ class C3(nn.Module):
330
323
  """CSP Bottleneck with 3 convolutions."""
331
324
 
332
325
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
333
- """
334
- Initialize the CSP Bottleneck with 3 convolutions.
326
+ """Initialize the CSP Bottleneck with 3 convolutions.
335
327
 
336
328
  Args:
337
329
  c1 (int): Input channels.
@@ -357,8 +349,7 @@ class C3x(C3):
357
349
  """C3 module with cross-convolutions."""
358
350
 
359
351
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
360
- """
361
- Initialize C3 module with cross-convolutions.
352
+ """Initialize C3 module with cross-convolutions.
362
353
 
363
354
  Args:
364
355
  c1 (int): Input channels.
@@ -377,8 +368,7 @@ class RepC3(nn.Module):
377
368
  """Rep C3."""
378
369
 
379
370
  def __init__(self, c1: int, c2: int, n: int = 3, e: float = 1.0):
380
- """
381
- Initialize CSP Bottleneck with a single convolution.
371
+ """Initialize CSP Bottleneck with a single convolution.
382
372
 
383
373
  Args:
384
374
  c1 (int): Input channels.
@@ -402,8 +392,7 @@ class C3TR(C3):
402
392
  """C3 module with TransformerBlock()."""
403
393
 
404
394
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
405
- """
406
- Initialize C3 module with TransformerBlock.
395
+ """Initialize C3 module with TransformerBlock.
407
396
 
408
397
  Args:
409
398
  c1 (int): Input channels.
@@ -422,8 +411,7 @@ class C3Ghost(C3):
422
411
  """C3 module with GhostBottleneck()."""
423
412
 
424
413
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
425
- """
426
- Initialize C3 module with GhostBottleneck.
414
+ """Initialize C3 module with GhostBottleneck.
427
415
 
428
416
  Args:
429
417
  c1 (int): Input channels.
@@ -442,8 +430,7 @@ class GhostBottleneck(nn.Module):
442
430
  """Ghost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones."""
443
431
 
444
432
  def __init__(self, c1: int, c2: int, k: int = 3, s: int = 1):
445
- """
446
- Initialize Ghost Bottleneck module.
433
+ """Initialize Ghost Bottleneck module.
447
434
 
448
435
  Args:
449
436
  c1 (int): Input channels.
@@ -473,8 +460,7 @@ class Bottleneck(nn.Module):
473
460
  def __init__(
474
461
  self, c1: int, c2: int, shortcut: bool = True, g: int = 1, k: tuple[int, int] = (3, 3), e: float = 0.5
475
462
  ):
476
- """
477
- Initialize a standard bottleneck module.
463
+ """Initialize a standard bottleneck module.
478
464
 
479
465
  Args:
480
466
  c1 (int): Input channels.
@@ -499,8 +485,7 @@ class BottleneckCSP(nn.Module):
499
485
  """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
500
486
 
501
487
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
502
- """
503
- Initialize CSP Bottleneck.
488
+ """Initialize CSP Bottleneck.
504
489
 
505
490
  Args:
506
491
  c1 (int): Input channels.
@@ -531,8 +516,7 @@ class ResNetBlock(nn.Module):
531
516
  """ResNet block with standard convolution layers."""
532
517
 
533
518
  def __init__(self, c1: int, c2: int, s: int = 1, e: int = 4):
534
- """
535
- Initialize ResNet block.
519
+ """Initialize ResNet block.
536
520
 
537
521
  Args:
538
522
  c1 (int): Input channels.
@@ -556,8 +540,7 @@ class ResNetLayer(nn.Module):
556
540
  """ResNet layer with multiple ResNet blocks."""
557
541
 
558
542
  def __init__(self, c1: int, c2: int, s: int = 1, is_first: bool = False, n: int = 1, e: int = 4):
559
- """
560
- Initialize ResNet layer.
543
+ """Initialize ResNet layer.
561
544
 
562
545
  Args:
563
546
  c1 (int): Input channels.
@@ -588,8 +571,7 @@ class MaxSigmoidAttnBlock(nn.Module):
588
571
  """Max Sigmoid attention block."""
589
572
 
590
573
  def __init__(self, c1: int, c2: int, nh: int = 1, ec: int = 128, gc: int = 512, scale: bool = False):
591
- """
592
- Initialize MaxSigmoidAttnBlock.
574
+ """Initialize MaxSigmoidAttnBlock.
593
575
 
594
576
  Args:
595
577
  c1 (int): Input channels.
@@ -609,8 +591,7 @@ class MaxSigmoidAttnBlock(nn.Module):
609
591
  self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0
610
592
 
611
593
  def forward(self, x: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
612
- """
613
- Forward pass of MaxSigmoidAttnBlock.
594
+ """Forward pass of MaxSigmoidAttnBlock.
614
595
 
615
596
  Args:
616
597
  x (torch.Tensor): Input tensor.
@@ -653,8 +634,7 @@ class C2fAttn(nn.Module):
653
634
  g: int = 1,
654
635
  e: float = 0.5,
655
636
  ):
656
- """
657
- Initialize C2f module with attention mechanism.
637
+ """Initialize C2f module with attention mechanism.
658
638
 
659
639
  Args:
660
640
  c1 (int): Input channels.
@@ -675,8 +655,7 @@ class C2fAttn(nn.Module):
675
655
  self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)
676
656
 
677
657
  def forward(self, x: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
678
- """
679
- Forward pass through C2f layer with attention.
658
+ """Forward pass through C2f layer with attention.
680
659
 
681
660
  Args:
682
661
  x (torch.Tensor): Input tensor.
@@ -691,8 +670,7 @@ class C2fAttn(nn.Module):
691
670
  return self.cv2(torch.cat(y, 1))
692
671
 
693
672
  def forward_split(self, x: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
694
- """
695
- Forward pass using split() instead of chunk().
673
+ """Forward pass using split() instead of chunk().
696
674
 
697
675
  Args:
698
676
  x (torch.Tensor): Input tensor.
@@ -713,8 +691,7 @@ class ImagePoolingAttn(nn.Module):
713
691
  def __init__(
714
692
  self, ec: int = 256, ch: tuple[int, ...] = (), ct: int = 512, nh: int = 8, k: int = 3, scale: bool = False
715
693
  ):
716
- """
717
- Initialize ImagePoolingAttn module.
694
+ """Initialize ImagePoolingAttn module.
718
695
 
719
696
  Args:
720
697
  ec (int): Embedding channels.
@@ -741,8 +718,7 @@ class ImagePoolingAttn(nn.Module):
741
718
  self.k = k
742
719
 
743
720
  def forward(self, x: list[torch.Tensor], text: torch.Tensor) -> torch.Tensor:
744
- """
745
- Forward pass of ImagePoolingAttn.
721
+ """Forward pass of ImagePoolingAttn.
746
722
 
747
723
  Args:
748
724
  x (list[torch.Tensor]): List of input feature maps.
@@ -785,8 +761,7 @@ class ContrastiveHead(nn.Module):
785
761
  self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
786
762
 
787
763
  def forward(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
788
- """
789
- Forward function of contrastive learning.
764
+ """Forward function of contrastive learning.
790
765
 
791
766
  Args:
792
767
  x (torch.Tensor): Image features.
@@ -802,16 +777,14 @@ class ContrastiveHead(nn.Module):
802
777
 
803
778
 
804
779
  class BNContrastiveHead(nn.Module):
805
- """
806
- Batch Norm Contrastive Head using batch norm instead of l2-normalization.
780
+ """Batch Norm Contrastive Head using batch norm instead of l2-normalization.
807
781
 
808
782
  Args:
809
783
  embed_dims (int): Embed dimensions of text and image features.
810
784
  """
811
785
 
812
786
  def __init__(self, embed_dims: int):
813
- """
814
- Initialize BNContrastiveHead.
787
+ """Initialize BNContrastiveHead.
815
788
 
816
789
  Args:
817
790
  embed_dims (int): Embedding dimensions for features.
@@ -830,13 +803,13 @@ class BNContrastiveHead(nn.Module):
830
803
  del self.logit_scale
831
804
  self.forward = self.forward_fuse
832
805
 
833
- def forward_fuse(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
806
+ @staticmethod
807
+ def forward_fuse(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
834
808
  """Passes input out unchanged."""
835
809
  return x
836
810
 
837
811
  def forward(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
838
- """
839
- Forward function of contrastive learning with batch normalization.
812
+ """Forward function of contrastive learning with batch normalization.
840
813
 
841
814
  Args:
842
815
  x (torch.Tensor): Image features.
@@ -858,8 +831,7 @@ class RepBottleneck(Bottleneck):
858
831
  def __init__(
859
832
  self, c1: int, c2: int, shortcut: bool = True, g: int = 1, k: tuple[int, int] = (3, 3), e: float = 0.5
860
833
  ):
861
- """
862
- Initialize RepBottleneck.
834
+ """Initialize RepBottleneck.
863
835
 
864
836
  Args:
865
837
  c1 (int): Input channels.
@@ -878,8 +850,7 @@ class RepCSP(C3):
878
850
  """Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction."""
879
851
 
880
852
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
881
- """
882
- Initialize RepCSP layer.
853
+ """Initialize RepCSP layer.
883
854
 
884
855
  Args:
885
856
  c1 (int): Input channels.
@@ -898,8 +869,7 @@ class RepNCSPELAN4(nn.Module):
898
869
  """CSP-ELAN."""
899
870
 
900
871
  def __init__(self, c1: int, c2: int, c3: int, c4: int, n: int = 1):
901
- """
902
- Initialize CSP-ELAN layer.
872
+ """Initialize CSP-ELAN layer.
903
873
 
904
874
  Args:
905
875
  c1 (int): Input channels.
@@ -932,8 +902,7 @@ class ELAN1(RepNCSPELAN4):
932
902
  """ELAN1 module with 4 convolutions."""
933
903
 
934
904
  def __init__(self, c1: int, c2: int, c3: int, c4: int):
935
- """
936
- Initialize ELAN1 layer.
905
+ """Initialize ELAN1 layer.
937
906
 
938
907
  Args:
939
908
  c1 (int): Input channels.
@@ -953,8 +922,7 @@ class AConv(nn.Module):
953
922
  """AConv."""
954
923
 
955
924
  def __init__(self, c1: int, c2: int):
956
- """
957
- Initialize AConv module.
925
+ """Initialize AConv module.
958
926
 
959
927
  Args:
960
928
  c1 (int): Input channels.
@@ -973,8 +941,7 @@ class ADown(nn.Module):
973
941
  """ADown."""
974
942
 
975
943
  def __init__(self, c1: int, c2: int):
976
- """
977
- Initialize ADown module.
944
+ """Initialize ADown module.
978
945
 
979
946
  Args:
980
947
  c1 (int): Input channels.
@@ -999,8 +966,7 @@ class SPPELAN(nn.Module):
999
966
  """SPP-ELAN."""
1000
967
 
1001
968
  def __init__(self, c1: int, c2: int, c3: int, k: int = 5):
1002
- """
1003
- Initialize SPP-ELAN block.
969
+ """Initialize SPP-ELAN block.
1004
970
 
1005
971
  Args:
1006
972
  c1 (int): Input channels.
@@ -1027,8 +993,7 @@ class CBLinear(nn.Module):
1027
993
  """CBLinear."""
1028
994
 
1029
995
  def __init__(self, c1: int, c2s: list[int], k: int = 1, s: int = 1, p: int | None = None, g: int = 1):
1030
- """
1031
- Initialize CBLinear module.
996
+ """Initialize CBLinear module.
1032
997
 
1033
998
  Args:
1034
999
  c1 (int): Input channels.
@@ -1051,8 +1016,7 @@ class CBFuse(nn.Module):
1051
1016
  """CBFuse."""
1052
1017
 
1053
1018
  def __init__(self, idx: list[int]):
1054
- """
1055
- Initialize CBFuse module.
1019
+ """Initialize CBFuse module.
1056
1020
 
1057
1021
  Args:
1058
1022
  idx (list[int]): Indices for feature selection.
@@ -1061,8 +1025,7 @@ class CBFuse(nn.Module):
1061
1025
  self.idx = idx
1062
1026
 
1063
1027
  def forward(self, xs: list[torch.Tensor]) -> torch.Tensor:
1064
- """
1065
- Forward pass through CBFuse layer.
1028
+ """Forward pass through CBFuse layer.
1066
1029
 
1067
1030
  Args:
1068
1031
  xs (list[torch.Tensor]): List of input tensors.
@@ -1079,8 +1042,7 @@ class C3f(nn.Module):
1079
1042
  """Faster Implementation of CSP Bottleneck with 2 convolutions."""
1080
1043
 
1081
1044
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = False, g: int = 1, e: float = 0.5):
1082
- """
1083
- Initialize CSP bottleneck layer with two convolutions.
1045
+ """Initialize CSP bottleneck layer with two convolutions.
1084
1046
 
1085
1047
  Args:
1086
1048
  c1 (int): Input channels.
@@ -1108,10 +1070,17 @@ class C3k2(C2f):
1108
1070
  """Faster Implementation of CSP Bottleneck with 2 convolutions."""
1109
1071
 
1110
1072
  def __init__(
1111
- self, c1: int, c2: int, n: int = 1, c3k: bool = False, e: float = 0.5, g: int = 1, shortcut: bool = True
1073
+ self,
1074
+ c1: int,
1075
+ c2: int,
1076
+ n: int = 1,
1077
+ c3k: bool = False,
1078
+ e: float = 0.5,
1079
+ attn: bool = False,
1080
+ g: int = 1,
1081
+ shortcut: bool = True,
1112
1082
  ):
1113
- """
1114
- Initialize C3k2 module.
1083
+ """Initialize C3k2 module.
1115
1084
 
1116
1085
  Args:
1117
1086
  c1 (int): Input channels.
@@ -1119,12 +1088,21 @@ class C3k2(C2f):
1119
1088
  n (int): Number of blocks.
1120
1089
  c3k (bool): Whether to use C3k blocks.
1121
1090
  e (float): Expansion ratio.
1091
+ attn (bool): Whether to use attention blocks.
1122
1092
  g (int): Groups for convolutions.
1123
1093
  shortcut (bool): Whether to use shortcut connections.
1124
1094
  """
1125
1095
  super().__init__(c1, c2, n, shortcut, g, e)
1126
1096
  self.m = nn.ModuleList(
1127
- C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
1097
+ nn.Sequential(
1098
+ Bottleneck(self.c, self.c, shortcut, g),
1099
+ PSABlock(self.c, attn_ratio=0.5, num_heads=max(self.c // 64, 1)),
1100
+ )
1101
+ if attn
1102
+ else C3k(self.c, self.c, 2, shortcut, g)
1103
+ if c3k
1104
+ else Bottleneck(self.c, self.c, shortcut, g)
1105
+ for _ in range(n)
1128
1106
  )
1129
1107
 
1130
1108
 
@@ -1132,8 +1110,7 @@ class C3k(C3):
1132
1110
  """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
1133
1111
 
1134
1112
  def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5, k: int = 3):
1135
- """
1136
- Initialize C3k module.
1113
+ """Initialize C3k module.
1137
1114
 
1138
1115
  Args:
1139
1116
  c1 (int): Input channels.
@@ -1154,8 +1131,7 @@ class RepVGGDW(torch.nn.Module):
1154
1131
  """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
1155
1132
 
1156
1133
  def __init__(self, ed: int) -> None:
1157
- """
1158
- Initialize RepVGGDW module.
1134
+ """Initialize RepVGGDW module.
1159
1135
 
1160
1136
  Args:
1161
1137
  ed (int): Input and output channels.
@@ -1167,8 +1143,7 @@ class RepVGGDW(torch.nn.Module):
1167
1143
  self.act = nn.SiLU()
1168
1144
 
1169
1145
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1170
- """
1171
- Perform a forward pass of the RepVGGDW block.
1146
+ """Perform a forward pass of the RepVGGDW block.
1172
1147
 
1173
1148
  Args:
1174
1149
  x (torch.Tensor): Input tensor.
@@ -1179,8 +1154,7 @@ class RepVGGDW(torch.nn.Module):
1179
1154
  return self.act(self.conv(x) + self.conv1(x))
1180
1155
 
1181
1156
  def forward_fuse(self, x: torch.Tensor) -> torch.Tensor:
1182
- """
1183
- Perform a forward pass of the RepVGGDW block without fusing the convolutions.
1157
+ """Perform a forward pass of the RepVGGDW block without fusing the convolutions.
1184
1158
 
1185
1159
  Args:
1186
1160
  x (torch.Tensor): Input tensor.
@@ -1192,11 +1166,12 @@ class RepVGGDW(torch.nn.Module):
1192
1166
 
1193
1167
  @torch.no_grad()
1194
1168
  def fuse(self):
1195
- """
1196
- Fuse the convolutional layers in the RepVGGDW block.
1169
+ """Fuse the convolutional layers in the RepVGGDW block.
1197
1170
 
1198
1171
  This method fuses the convolutional layers and updates the weights and biases accordingly.
1199
1172
  """
1173
+ if not hasattr(self, "conv1"):
1174
+ return # already fused
1200
1175
  conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
1201
1176
  conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)
1202
1177
 
@@ -1218,8 +1193,7 @@ class RepVGGDW(torch.nn.Module):
1218
1193
 
1219
1194
 
1220
1195
  class CIB(nn.Module):
1221
- """
1222
- Conditional Identity Block (CIB) module.
1196
+ """Compact Inverted Block (CIB) module.
1223
1197
 
1224
1198
  Args:
1225
1199
  c1 (int): Number of input channels.
@@ -1230,8 +1204,7 @@ class CIB(nn.Module):
1230
1204
  """
1231
1205
 
1232
1206
  def __init__(self, c1: int, c2: int, shortcut: bool = True, e: float = 0.5, lk: bool = False):
1233
- """
1234
- Initialize the CIB module.
1207
+ """Initialize the CIB module.
1235
1208
 
1236
1209
  Args:
1237
1210
  c1 (int): Input channels.
@@ -1253,8 +1226,7 @@ class CIB(nn.Module):
1253
1226
  self.add = shortcut and c1 == c2
1254
1227
 
1255
1228
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1256
- """
1257
- Forward pass of the CIB module.
1229
+ """Forward pass of the CIB module.
1258
1230
 
1259
1231
  Args:
1260
1232
  x (torch.Tensor): Input tensor.
@@ -1266,15 +1238,14 @@ class CIB(nn.Module):
1266
1238
 
1267
1239
 
1268
1240
  class C2fCIB(C2f):
1269
- """
1270
- C2fCIB class represents a convolutional block with C2f and CIB modules.
1241
+ """C2fCIB class represents a convolutional block with C2f and CIB modules.
1271
1242
 
1272
1243
  Args:
1273
1244
  c1 (int): Number of input channels.
1274
1245
  c2 (int): Number of output channels.
1275
1246
  n (int, optional): Number of CIB modules to stack. Defaults to 1.
1276
1247
  shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
1277
- lk (bool, optional): Whether to use local key connection. Defaults to False.
1248
+ lk (bool, optional): Whether to use large kernel. Defaults to False.
1278
1249
  g (int, optional): Number of groups for grouped convolution. Defaults to 1.
1279
1250
  e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
1280
1251
  """
@@ -1282,15 +1253,14 @@ class C2fCIB(C2f):
1282
1253
  def __init__(
1283
1254
  self, c1: int, c2: int, n: int = 1, shortcut: bool = False, lk: bool = False, g: int = 1, e: float = 0.5
1284
1255
  ):
1285
- """
1286
- Initialize C2fCIB module.
1256
+ """Initialize C2fCIB module.
1287
1257
 
1288
1258
  Args:
1289
1259
  c1 (int): Input channels.
1290
1260
  c2 (int): Output channels.
1291
1261
  n (int): Number of CIB modules.
1292
1262
  shortcut (bool): Whether to use shortcut connection.
1293
- lk (bool): Whether to use local key connection.
1263
+ lk (bool): Whether to use large kernel.
1294
1264
  g (int): Groups for convolutions.
1295
1265
  e (float): Expansion ratio.
1296
1266
  """
@@ -1299,8 +1269,7 @@ class C2fCIB(C2f):
1299
1269
 
1300
1270
 
1301
1271
  class Attention(nn.Module):
1302
- """
1303
- Attention module that performs self-attention on the input tensor.
1272
+ """Attention module that performs self-attention on the input tensor.
1304
1273
 
1305
1274
  Args:
1306
1275
  dim (int): The input tensor dimension.
@@ -1318,8 +1287,7 @@ class Attention(nn.Module):
1318
1287
  """
1319
1288
 
1320
1289
  def __init__(self, dim: int, num_heads: int = 8, attn_ratio: float = 0.5):
1321
- """
1322
- Initialize multi-head attention module.
1290
+ """Initialize multi-head attention module.
1323
1291
 
1324
1292
  Args:
1325
1293
  dim (int): Input dimension.
@@ -1338,8 +1306,7 @@ class Attention(nn.Module):
1338
1306
  self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
1339
1307
 
1340
1308
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1341
- """
1342
- Forward pass of the Attention module.
1309
+ """Forward pass of the Attention module.
1343
1310
 
1344
1311
  Args:
1345
1312
  x (torch.Tensor): The input tensor.
@@ -1362,8 +1329,7 @@ class Attention(nn.Module):
1362
1329
 
1363
1330
 
1364
1331
  class PSABlock(nn.Module):
1365
- """
1366
- PSABlock class implementing a Position-Sensitive Attention block for neural networks.
1332
+ """PSABlock class implementing a Position-Sensitive Attention block for neural networks.
1367
1333
 
1368
1334
  This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
1369
1335
  with optional shortcut connections.
@@ -1384,8 +1350,7 @@ class PSABlock(nn.Module):
1384
1350
  """
1385
1351
 
1386
1352
  def __init__(self, c: int, attn_ratio: float = 0.5, num_heads: int = 4, shortcut: bool = True) -> None:
1387
- """
1388
- Initialize the PSABlock.
1353
+ """Initialize the PSABlock.
1389
1354
 
1390
1355
  Args:
1391
1356
  c (int): Input and output channels.
@@ -1400,8 +1365,7 @@ class PSABlock(nn.Module):
1400
1365
  self.add = shortcut
1401
1366
 
1402
1367
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1403
- """
1404
- Execute a forward pass through PSABlock.
1368
+ """Execute a forward pass through PSABlock.
1405
1369
 
1406
1370
  Args:
1407
1371
  x (torch.Tensor): Input tensor.
@@ -1415,8 +1379,7 @@ class PSABlock(nn.Module):
1415
1379
 
1416
1380
 
1417
1381
  class PSA(nn.Module):
1418
- """
1419
- PSA class for implementing Position-Sensitive Attention in neural networks.
1382
+ """PSA class for implementing Position-Sensitive Attention in neural networks.
1420
1383
 
1421
1384
  This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
1422
1385
  input tensors, enhancing feature extraction and processing capabilities.
@@ -1439,8 +1402,7 @@ class PSA(nn.Module):
1439
1402
  """
1440
1403
 
1441
1404
  def __init__(self, c1: int, c2: int, e: float = 0.5):
1442
- """
1443
- Initialize PSA module.
1405
+ """Initialize PSA module.
1444
1406
 
1445
1407
  Args:
1446
1408
  c1 (int): Input channels.
@@ -1453,12 +1415,11 @@ class PSA(nn.Module):
1453
1415
  self.cv1 = Conv(c1, 2 * self.c, 1, 1)
1454
1416
  self.cv2 = Conv(2 * self.c, c1, 1)
1455
1417
 
1456
- self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
1418
+ self.attn = Attention(self.c, attn_ratio=0.5, num_heads=max(self.c // 64, 1))
1457
1419
  self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
1458
1420
 
1459
1421
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1460
- """
1461
- Execute forward pass in PSA module.
1422
+ """Execute forward pass in PSA module.
1462
1423
 
1463
1424
  Args:
1464
1425
  x (torch.Tensor): Input tensor.
@@ -1473,8 +1434,7 @@ class PSA(nn.Module):
1473
1434
 
1474
1435
 
1475
1436
  class C2PSA(nn.Module):
1476
- """
1477
- C2PSA module with attention mechanism for enhanced feature extraction and processing.
1437
+ """C2PSA module with attention mechanism for enhanced feature extraction and processing.
1478
1438
 
1479
1439
  This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
1480
1440
  capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.
@@ -1488,18 +1448,17 @@ class C2PSA(nn.Module):
1488
1448
  Methods:
1489
1449
  forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.
1490
1450
 
1491
- Notes:
1492
- This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
1493
-
1494
1451
  Examples:
1495
1452
  >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
1496
1453
  >>> input_tensor = torch.randn(1, 256, 64, 64)
1497
1454
  >>> output_tensor = c2psa(input_tensor)
1455
+
1456
+ Notes:
1457
+ This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
1498
1458
  """
1499
1459
 
1500
1460
  def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
1501
- """
1502
- Initialize C2PSA module.
1461
+ """Initialize C2PSA module.
1503
1462
 
1504
1463
  Args:
1505
1464
  c1 (int): Input channels.
@@ -1516,8 +1475,7 @@ class C2PSA(nn.Module):
1516
1475
  self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
1517
1476
 
1518
1477
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1519
- """
1520
- Process the input tensor through a series of PSA blocks.
1478
+ """Process the input tensor through a series of PSA blocks.
1521
1479
 
1522
1480
  Args:
1523
1481
  x (torch.Tensor): Input tensor.
@@ -1531,10 +1489,10 @@ class C2PSA(nn.Module):
1531
1489
 
1532
1490
 
1533
1491
  class C2fPSA(C2f):
1534
- """
1535
- C2fPSA module with enhanced feature extraction using PSA blocks.
1492
+ """C2fPSA module with enhanced feature extraction using PSA blocks.
1536
1493
 
1537
- This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.
1494
+ This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature
1495
+ extraction.
1538
1496
 
1539
1497
  Attributes:
1540
1498
  c (int): Number of hidden channels.
@@ -1556,8 +1514,7 @@ class C2fPSA(C2f):
1556
1514
  """
1557
1515
 
1558
1516
  def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
1559
- """
1560
- Initialize C2fPSA module.
1517
+ """Initialize C2fPSA module.
1561
1518
 
1562
1519
  Args:
1563
1520
  c1 (int): Input channels.
@@ -1571,8 +1528,7 @@ class C2fPSA(C2f):
1571
1528
 
1572
1529
 
1573
1530
  class SCDown(nn.Module):
1574
- """
1575
- SCDown module for downsampling with separable convolutions.
1531
+ """SCDown module for downsampling with separable convolutions.
1576
1532
 
1577
1533
  This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
1578
1534
  efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.
@@ -1595,8 +1551,7 @@ class SCDown(nn.Module):
1595
1551
  """
1596
1552
 
1597
1553
  def __init__(self, c1: int, c2: int, k: int, s: int):
1598
- """
1599
- Initialize SCDown module.
1554
+ """Initialize SCDown module.
1600
1555
 
1601
1556
  Args:
1602
1557
  c1 (int): Input channels.
@@ -1609,8 +1564,7 @@ class SCDown(nn.Module):
1609
1564
  self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
1610
1565
 
1611
1566
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1612
- """
1613
- Apply convolution and downsampling to the input tensor.
1567
+ """Apply convolution and downsampling to the input tensor.
1614
1568
 
1615
1569
  Args:
1616
1570
  x (torch.Tensor): Input tensor.
@@ -1622,27 +1576,26 @@ class SCDown(nn.Module):
1622
1576
 
1623
1577
 
1624
1578
  class TorchVision(nn.Module):
1625
- """
1626
- TorchVision module to allow loading any torchvision model.
1579
+ """TorchVision module to allow loading any torchvision model.
1627
1580
 
1628
- This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and customize the model by truncating or unwrapping layers.
1629
-
1630
- Attributes:
1631
- m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.
1581
+ This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and
1582
+ customize the model by truncating or unwrapping layers.
1632
1583
 
1633
1584
  Args:
1634
1585
  model (str): Name of the torchvision model to load.
1635
1586
  weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
1636
- unwrap (bool, optional): If True, unwraps the model to a sequential containing all but the last `truncate` layers. Default is True.
1587
+ unwrap (bool, optional): Unwraps the model to a sequential containing all but the last `truncate` layers.
1637
1588
  truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
1638
1589
  split (bool, optional): Returns output from intermediate child modules as list. Default is False.
1590
+
1591
+ Attributes:
1592
+ m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.
1639
1593
  """
1640
1594
 
1641
1595
  def __init__(
1642
1596
  self, model: str, weights: str = "DEFAULT", unwrap: bool = True, truncate: int = 2, split: bool = False
1643
1597
  ):
1644
- """
1645
- Load the model and weights from torchvision.
1598
+ """Load the model and weights from torchvision.
1646
1599
 
1647
1600
  Args:
1648
1601
  model (str): Name of the torchvision model to load.
@@ -1669,8 +1622,7 @@ class TorchVision(nn.Module):
1669
1622
  self.m.head = self.m.heads = nn.Identity()
1670
1623
 
1671
1624
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1672
- """
1673
- Forward pass through the model.
1625
+ """Forward pass through the model.
1674
1626
 
1675
1627
  Args:
1676
1628
  x (torch.Tensor): Input tensor.
@@ -1687,8 +1639,7 @@ class TorchVision(nn.Module):
1687
1639
 
1688
1640
 
1689
1641
  class AAttn(nn.Module):
1690
- """
1691
- Area-attention module for YOLO models, providing efficient attention mechanisms.
1642
+ """Area-attention module for YOLO models, providing efficient attention mechanisms.
1692
1643
 
1693
1644
  This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
1694
1645
  making it particularly effective for object detection tasks.
@@ -1713,8 +1664,7 @@ class AAttn(nn.Module):
1713
1664
  """
1714
1665
 
1715
1666
  def __init__(self, dim: int, num_heads: int, area: int = 1):
1716
- """
1717
- Initialize an Area-attention module for YOLO models.
1667
+ """Initialize an Area-attention module for YOLO models.
1718
1668
 
1719
1669
  Args:
1720
1670
  dim (int): Number of hidden channels.
@@ -1733,8 +1683,7 @@ class AAttn(nn.Module):
1733
1683
  self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)
1734
1684
 
1735
1685
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1736
- """
1737
- Process the input tensor through the area-attention.
1686
+ """Process the input tensor through the area-attention.
1738
1687
 
1739
1688
  Args:
1740
1689
  x (torch.Tensor): Input tensor.
@@ -1773,8 +1722,7 @@ class AAttn(nn.Module):
1773
1722
 
1774
1723
 
1775
1724
  class ABlock(nn.Module):
1776
- """
1777
- Area-attention block module for efficient feature extraction in YOLO models.
1725
+ """Area-attention block module for efficient feature extraction in YOLO models.
1778
1726
 
1779
1727
  This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
1780
1728
  It uses a novel area-based attention approach that is more efficient than traditional self-attention while
@@ -1797,8 +1745,7 @@ class ABlock(nn.Module):
1797
1745
  """
1798
1746
 
1799
1747
  def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 1.2, area: int = 1):
1800
- """
1801
- Initialize an Area-attention block module.
1748
+ """Initialize an Area-attention block module.
1802
1749
 
1803
1750
  Args:
1804
1751
  dim (int): Number of input channels.
@@ -1814,9 +1761,9 @@ class ABlock(nn.Module):
1814
1761
 
1815
1762
  self.apply(self._init_weights)
1816
1763
 
1817
- def _init_weights(self, m: nn.Module):
1818
- """
1819
- Initialize weights using a truncated normal distribution.
1764
+ @staticmethod
1765
+ def _init_weights(m: nn.Module):
1766
+ """Initialize weights using a truncated normal distribution.
1820
1767
 
1821
1768
  Args:
1822
1769
  m (nn.Module): Module to initialize.
@@ -1827,8 +1774,7 @@ class ABlock(nn.Module):
1827
1774
  nn.init.constant_(m.bias, 0)
1828
1775
 
1829
1776
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1830
- """
1831
- Forward pass through ABlock.
1777
+ """Forward pass through ABlock.
1832
1778
 
1833
1779
  Args:
1834
1780
  x (torch.Tensor): Input tensor.
@@ -1841,8 +1787,7 @@ class ABlock(nn.Module):
1841
1787
 
1842
1788
 
1843
1789
  class A2C2f(nn.Module):
1844
- """
1845
- Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
1790
+ """Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
1846
1791
 
1847
1792
  This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
1848
1793
  processing. It supports both area-attention and standard convolution modes.
@@ -1877,8 +1822,7 @@ class A2C2f(nn.Module):
1877
1822
  g: int = 1,
1878
1823
  shortcut: bool = True,
1879
1824
  ):
1880
- """
1881
- Initialize Area-Attention C2f module.
1825
+ """Initialize Area-Attention C2f module.
1882
1826
 
1883
1827
  Args:
1884
1828
  c1 (int): Number of input channels.
@@ -1894,7 +1838,7 @@ class A2C2f(nn.Module):
1894
1838
  """
1895
1839
  super().__init__()
1896
1840
  c_ = int(c2 * e) # hidden channels
1897
- assert c_ % 32 == 0, "Dimension of ABlock be a multiple of 32."
1841
+ assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."
1898
1842
 
1899
1843
  self.cv1 = Conv(c1, c_, 1, 1)
1900
1844
  self.cv2 = Conv((1 + n) * c_, c2, 1)
@@ -1908,8 +1852,7 @@ class A2C2f(nn.Module):
1908
1852
  )
1909
1853
 
1910
1854
  def forward(self, x: torch.Tensor) -> torch.Tensor:
1911
- """
1912
- Forward pass through A2C2f layer.
1855
+ """Forward pass through A2C2f layer.
1913
1856
 
1914
1857
  Args:
1915
1858
  x (torch.Tensor): Input tensor.
@@ -1929,8 +1872,7 @@ class SwiGLUFFN(nn.Module):
1929
1872
  """SwiGLU Feed-Forward Network for transformer-based architectures."""
1930
1873
 
1931
1874
  def __init__(self, gc: int, ec: int, e: int = 4) -> None:
1932
- """
1933
- Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.
1875
+ """Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.
1934
1876
 
1935
1877
  Args:
1936
1878
  gc (int): Guide channels.
@@ -1953,8 +1895,7 @@ class Residual(nn.Module):
1953
1895
  """Residual connection wrapper for neural network modules."""
1954
1896
 
1955
1897
  def __init__(self, m: nn.Module) -> None:
1956
- """
1957
- Initialize residual module with the wrapped module.
1898
+ """Initialize residual module with the wrapped module.
1958
1899
 
1959
1900
  Args:
1960
1901
  m (nn.Module): Module to wrap with residual connection.
@@ -1975,8 +1916,7 @@ class SAVPE(nn.Module):
1975
1916
  """Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
1976
1917
 
1977
1918
  def __init__(self, ch: list[int], c3: int, embed: int):
1978
- """
1979
- Initialize SAVPE module with channels, intermediate channels, and embedding dimension.
1919
+ """Initialize SAVPE module with channels, intermediate channels, and embedding dimension.
1980
1920
 
1981
1921
  Args:
1982
1922
  ch (list[int]): List of input channel dimensions.
@@ -2029,3 +1969,99 @@ class SAVPE(nn.Module):
2029
1969
  aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
2030
1970
 
2031
1971
  return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
1972
+
1973
+
1974
class Proto26(Proto):
    """Ultralytics YOLO26 models mask Proto module for segmentation models.

    Extends `Proto` with a multi-scale feature fusion step in front of the prototype branch and an auxiliary semantic
    segmentation head whose output is only returned while the module is in training mode.

    Attributes:
        feat_refine (nn.ModuleList): One 1x1 `Conv` per feature level after the first, projecting it to `ch[0]`
            channels.
        feat_fuse (Conv): 3x3 `Conv` mapping the fused `ch[0]`-channel feature map to `c_` channels.
        semseg (nn.Sequential | None): Auxiliary semantic segmentation head producing `nc` channels; set to None by
            `fuse()`.
    """

    def __init__(self, ch: tuple = (), c_: int = 256, c2: int = 32, nc: int = 80):
        """Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.

        Args:
            ch (tuple): Tuple of channel sizes from backbone feature maps. Must be non-empty because `ch[0]` is used
                as the common channel width.
            c_ (int): Intermediate channels.
            c2 (int): Output channels (number of protos).
            nc (int): Number of classes for semantic segmentation.
        """
        # The parent prototype branch is fed by `feat_fuse`, which already outputs `c_` channels.
        super().__init__(c_, c_, c2)
        self.feat_refine = nn.ModuleList(Conv(x, ch[0], k=1) for x in ch[1:])
        self.feat_fuse = Conv(ch[0], c_, k=3)
        self.semseg = nn.Sequential(Conv(ch[0], c_, k=3), Conv(c_, c_, k=3), nn.Conv2d(c_, nc, 1))

    def forward(
        self, x: list[torch.Tensor], return_semseg: bool = True
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Fuse multi-scale features and compute mask prototypes.

        Every feature map after the first is projected by its `feat_refine` conv, resized to the spatial size of
        `x[0]` with nearest-neighbour interpolation and added to `x[0]`. The sum goes through `feat_fuse` and the
        parent `Proto.forward`.

        Args:
            x (list[torch.Tensor]): Feature maps, one per entry of `ch`; `x[0]` defines the output resolution of the
                fused feature map.
            return_semseg (bool): Whether to also return the auxiliary semantic segmentation output during training.

        Returns:
            (torch.Tensor | tuple[torch.Tensor, torch.Tensor]): The prototypes `p`, or `(p, semseg)` when the module
                is in training mode, `return_semseg` is True and the semantic head has not been removed by `fuse()`.
        """
        feat = x[0]
        for i, refine in enumerate(self.feat_refine, start=1):
            up_feat = F.interpolate(refine(x[i]), size=feat.shape[2:], mode="nearest")
            feat = feat + up_feat
        p = super().forward(self.feat_fuse(feat))
        # `fuse()` drops the semantic head; guard so a later training-mode call does not try to call None.
        if self.training and return_semseg and self.semseg is not None:
            return p, self.semseg(feat)
        return p

    def fuse(self):
        """Fuse the model for inference by removing the semantic segmentation head."""
        self.semseg = None
2007
+
2008
+
2009
class RealNVP(nn.Module):
    """RealNVP: a flow-based generative model.

    The flow operates on 2-dimensional samples and is built from six affine coupling layers. Each layer uses a binary
    mask (alternating `[0, 1]` and `[1, 0]`) to keep one coordinate fixed while the other is shifted and scaled by
    small MLPs conditioned on the fixed coordinate. The latent prior is a multivariate normal parameterised by the
    `loc` and `cov` buffers (zero mean, identity covariance at construction).

    Attributes:
        loc (torch.Tensor): Buffer with the prior mean, shape (2,).
        cov (torch.Tensor): Buffer with the prior covariance matrix, shape (2, 2).
        mask (torch.Tensor): Buffer with one binary coupling mask per layer, shape (6, 2).
        s (nn.ModuleList): Scale networks, one per coupling layer.
        t (nn.ModuleList): Translation networks, one per coupling layer.

    References:
        https://arxiv.org/abs/1605.08803
        https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/utils/realnvp.py
    """

    def __init__(self):
        """Register the prior/mask buffers, build one scale and one translation net per mask row, and init weights."""
        super().__init__()

        self.register_buffer("loc", torch.zeros(2))
        self.register_buffer("cov", torch.eye(2))
        self.register_buffer("mask", torch.tensor([[0, 1], [1, 0]] * 3, dtype=torch.float32))

        n_layers = len(self.mask)
        # All scale nets are created before any translation net so parameter creation order is unchanged.
        self.s = torch.nn.ModuleList([self.nets() for _ in range(n_layers)])
        self.t = torch.nn.ModuleList([self.nett() for _ in range(n_layers)])
        self.init_weights()

    @staticmethod
    def nets():
        """Get the scale model in a single invertible mapping (2 -> 64 -> 64 -> 2 MLP with a Tanh output)."""
        return nn.Sequential(nn.Linear(2, 64), nn.SiLU(), nn.Linear(64, 64), nn.SiLU(), nn.Linear(64, 2), nn.Tanh())

    @staticmethod
    def nett():
        """Get the translation model in a single invertible mapping (2 -> 64 -> 64 -> 2 MLP, linear output)."""
        return nn.Sequential(nn.Linear(2, 64), nn.SiLU(), nn.Linear(64, 64), nn.SiLU(), nn.Linear(64, 2))

    @property
    def prior(self):
        """The prior distribution, rebuilt from the current `loc` and `cov` buffers on every access."""
        return torch.distributions.MultivariateNormal(self.loc, self.cov)

    def init_weights(self):
        """Initialize the weights of every linear layer with Xavier-uniform and a gain of 0.01."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=0.01)

    def backward_p(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply mapping from the data space to the latent space and calculate the log determinant of the Jacobian
        matrix.

        Args:
            x (torch.Tensor): Samples in data space, shape (N, 2).

        Returns:
            z (torch.Tensor): Samples mapped to latent space, shape (N, 2).
            log_det_jacob (torch.Tensor): Log-determinant of the Jacobian of the mapping per sample, shape (N,).
        """
        z = x
        log_det_jacob = x.new_zeros(x.shape[0])
        # Walk the coupling layers from last to first to invert the flow.
        for mask, scale_net, shift_net in reversed(list(zip(self.mask, self.s, self.t))):
            kept = mask * z  # coordinates passed through unchanged; they condition both networks
            free = 1 - mask  # selects the coordinates transformed by this layer
            s = scale_net(kept) * free
            t = shift_net(kept) * free
            z = free * (z - t) * torch.exp(-s) + kept
            log_det_jacob -= s.sum(dim=1)
        return z, log_det_jacob

    def log_prob(self, x: torch.Tensor) -> torch.Tensor:
        """Calculate the log probability of given sample in data space.

        Args:
            x (torch.Tensor): Samples in data space, shape (N, 2).

        Returns:
            (torch.Tensor): Log-probability of each sample, shape (N,).
        """
        # If the module was cast to a non-float32 dtype but the input is float32, cast the module back (in place).
        if x.dtype == torch.float32 and self.s[0][0].weight.dtype != torch.float32:
            self.float()
        z, log_det = self.backward_p(x)
        return self.prior.log_prob(z) + log_det