ultralytics 8.3.89__py3-none-any.whl → 8.3.91__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. tests/conftest.py +2 -2
  2. tests/test_cli.py +13 -11
  3. tests/test_cuda.py +10 -1
  4. tests/test_exports.py +2 -2
  5. tests/test_integrations.py +1 -5
  6. tests/test_python.py +16 -16
  7. tests/test_solutions.py +9 -9
  8. ultralytics/__init__.py +1 -1
  9. ultralytics/cfg/__init__.py +3 -1
  10. ultralytics/cfg/models/11/yolo11-cls.yaml +5 -5
  11. ultralytics/cfg/models/11/yolo11-obb.yaml +5 -5
  12. ultralytics/cfg/models/11/yolo11-pose.yaml +5 -5
  13. ultralytics/cfg/models/11/yolo11-seg.yaml +5 -5
  14. ultralytics/cfg/models/11/yolo11.yaml +5 -5
  15. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +5 -5
  16. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +5 -5
  17. ultralytics/cfg/models/v8/yolov8-ghost.yaml +5 -5
  18. ultralytics/cfg/models/v8/yolov8-obb.yaml +5 -5
  19. ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
  20. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +5 -5
  21. ultralytics/cfg/models/v8/yolov8-world.yaml +5 -5
  22. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +5 -5
  23. ultralytics/cfg/models/v8/yolov8.yaml +5 -5
  24. ultralytics/cfg/models/v9/yolov9c-seg.yaml +1 -1
  25. ultralytics/cfg/models/v9/yolov9c.yaml +1 -1
  26. ultralytics/cfg/models/v9/yolov9e-seg.yaml +1 -1
  27. ultralytics/cfg/models/v9/yolov9e.yaml +1 -1
  28. ultralytics/cfg/models/v9/yolov9m.yaml +1 -1
  29. ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
  30. ultralytics/cfg/models/v9/yolov9t.yaml +1 -1
  31. ultralytics/data/annotator.py +9 -14
  32. ultralytics/data/base.py +118 -30
  33. ultralytics/data/build.py +63 -24
  34. ultralytics/data/converter.py +5 -5
  35. ultralytics/data/dataset.py +207 -53
  36. ultralytics/data/loaders.py +1 -0
  37. ultralytics/data/split_dota.py +39 -12
  38. ultralytics/data/utils.py +15 -19
  39. ultralytics/engine/exporter.py +24 -23
  40. ultralytics/engine/model.py +67 -88
  41. ultralytics/engine/predictor.py +106 -21
  42. ultralytics/engine/trainer.py +32 -23
  43. ultralytics/engine/tuner.py +21 -18
  44. ultralytics/engine/validator.py +75 -41
  45. ultralytics/hub/__init__.py +12 -13
  46. ultralytics/hub/auth.py +9 -12
  47. ultralytics/hub/session.py +76 -21
  48. ultralytics/hub/utils.py +19 -17
  49. ultralytics/models/fastsam/model.py +20 -11
  50. ultralytics/models/fastsam/predict.py +36 -16
  51. ultralytics/models/fastsam/utils.py +5 -5
  52. ultralytics/models/fastsam/val.py +6 -6
  53. ultralytics/models/nas/model.py +22 -11
  54. ultralytics/models/nas/predict.py +9 -4
  55. ultralytics/models/nas/val.py +5 -5
  56. ultralytics/models/rtdetr/model.py +20 -11
  57. ultralytics/models/rtdetr/predict.py +18 -15
  58. ultralytics/models/rtdetr/train.py +20 -16
  59. ultralytics/models/rtdetr/val.py +42 -6
  60. ultralytics/models/sam/__init__.py +1 -1
  61. ultralytics/models/sam/amg.py +50 -4
  62. ultralytics/models/sam/model.py +8 -14
  63. ultralytics/models/sam/modules/decoders.py +18 -21
  64. ultralytics/models/sam/modules/encoders.py +25 -46
  65. ultralytics/models/sam/modules/memory_attention.py +19 -15
  66. ultralytics/models/sam/modules/sam.py +18 -25
  67. ultralytics/models/sam/modules/tiny_encoder.py +19 -29
  68. ultralytics/models/sam/modules/transformer.py +35 -57
  69. ultralytics/models/sam/modules/utils.py +15 -15
  70. ultralytics/models/sam/predict.py +0 -3
  71. ultralytics/models/utils/loss.py +87 -36
  72. ultralytics/models/utils/ops.py +26 -31
  73. ultralytics/models/yolo/classify/predict.py +24 -3
  74. ultralytics/models/yolo/classify/train.py +77 -10
  75. ultralytics/models/yolo/classify/val.py +40 -15
  76. ultralytics/models/yolo/detect/predict.py +23 -10
  77. ultralytics/models/yolo/detect/train.py +85 -15
  78. ultralytics/models/yolo/detect/val.py +145 -21
  79. ultralytics/models/yolo/model.py +1 -2
  80. ultralytics/models/yolo/obb/predict.py +12 -4
  81. ultralytics/models/yolo/obb/train.py +7 -0
  82. ultralytics/models/yolo/obb/val.py +25 -7
  83. ultralytics/models/yolo/pose/predict.py +22 -6
  84. ultralytics/models/yolo/pose/train.py +17 -1
  85. ultralytics/models/yolo/pose/val.py +46 -21
  86. ultralytics/models/yolo/segment/predict.py +22 -8
  87. ultralytics/models/yolo/segment/train.py +6 -0
  88. ultralytics/models/yolo/segment/val.py +100 -14
  89. ultralytics/models/yolo/world/train.py +38 -8
  90. ultralytics/models/yolo/world/train_world.py +39 -10
  91. ultralytics/nn/autobackend.py +28 -14
  92. ultralytics/nn/modules/__init__.py +3 -0
  93. ultralytics/nn/modules/activation.py +12 -3
  94. ultralytics/nn/modules/block.py +587 -84
  95. ultralytics/nn/modules/conv.py +418 -54
  96. ultralytics/nn/modules/head.py +3 -4
  97. ultralytics/nn/modules/transformer.py +320 -34
  98. ultralytics/nn/modules/utils.py +17 -3
  99. ultralytics/nn/tasks.py +221 -69
  100. ultralytics/solutions/ai_gym.py +2 -2
  101. ultralytics/solutions/analytics.py +4 -4
  102. ultralytics/solutions/heatmap.py +4 -4
  103. ultralytics/solutions/instance_segmentation.py +10 -4
  104. ultralytics/solutions/object_blurrer.py +2 -2
  105. ultralytics/solutions/object_counter.py +2 -2
  106. ultralytics/solutions/object_cropper.py +2 -2
  107. ultralytics/solutions/parking_management.py +9 -9
  108. ultralytics/solutions/queue_management.py +1 -1
  109. ultralytics/solutions/region_counter.py +2 -2
  110. ultralytics/solutions/security_alarm.py +7 -7
  111. ultralytics/solutions/solutions.py +7 -4
  112. ultralytics/solutions/speed_estimation.py +2 -2
  113. ultralytics/solutions/streamlit_inference.py +6 -6
  114. ultralytics/solutions/trackzone.py +9 -2
  115. ultralytics/solutions/vision_eye.py +4 -4
  116. ultralytics/trackers/basetrack.py +1 -1
  117. ultralytics/trackers/bot_sort.py +23 -22
  118. ultralytics/trackers/byte_tracker.py +4 -4
  119. ultralytics/trackers/track.py +2 -1
  120. ultralytics/trackers/utils/gmc.py +26 -27
  121. ultralytics/trackers/utils/kalman_filter.py +31 -29
  122. ultralytics/trackers/utils/matching.py +7 -7
  123. ultralytics/utils/__init__.py +32 -27
  124. ultralytics/utils/autobatch.py +5 -5
  125. ultralytics/utils/benchmarks.py +111 -18
  126. ultralytics/utils/callbacks/base.py +3 -3
  127. ultralytics/utils/callbacks/clearml.py +11 -11
  128. ultralytics/utils/callbacks/comet.py +42 -24
  129. ultralytics/utils/callbacks/dvc.py +11 -10
  130. ultralytics/utils/callbacks/hub.py +8 -8
  131. ultralytics/utils/callbacks/mlflow.py +1 -1
  132. ultralytics/utils/callbacks/neptune.py +12 -10
  133. ultralytics/utils/callbacks/raytune.py +1 -1
  134. ultralytics/utils/callbacks/tensorboard.py +6 -6
  135. ultralytics/utils/callbacks/wb.py +16 -16
  136. ultralytics/utils/checks.py +116 -35
  137. ultralytics/utils/dist.py +15 -2
  138. ultralytics/utils/downloads.py +13 -9
  139. ultralytics/utils/files.py +12 -13
  140. ultralytics/utils/instance.py +112 -45
  141. ultralytics/utils/loss.py +28 -33
  142. ultralytics/utils/metrics.py +246 -181
  143. ultralytics/utils/ops.py +61 -53
  144. ultralytics/utils/patches.py +8 -6
  145. ultralytics/utils/plotting.py +65 -45
  146. ultralytics/utils/tal.py +88 -57
  147. ultralytics/utils/torch_utils.py +181 -33
  148. ultralytics/utils/triton.py +13 -3
  149. ultralytics/utils/tuner.py +8 -16
  150. {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/METADATA +1 -1
  151. ultralytics-8.3.91.dist-info/RECORD +250 -0
  152. ultralytics-8.3.89.dist-info/RECORD +0 -250
  153. {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/LICENSE +0 -0
  154. {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/WHEEL +0 -0
  155. {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/entry_points.txt +0 -0
  156. {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/top_level.txt +0 -0
@@ -69,7 +69,7 @@ class DFL(nn.Module):
69
69
  self.c1 = c1
70
70
 
71
71
  def forward(self, x):
72
- """Applies a transformer layer on input tensor 'x' and returns a tensor."""
72
+ """Apply the DFL module to input tensor and return transformed output."""
73
73
  b, _, a = x.shape # batch, channels, anchors
74
74
  return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
75
75
  # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
@@ -80,9 +80,12 @@ class Proto(nn.Module):
80
80
 
81
81
  def __init__(self, c1, c_=256, c2=32):
82
82
  """
83
- Initializes the YOLOv8 mask Proto module with specified number of protos and masks.
83
+ Initialize the YOLOv8 mask Proto module with specified number of protos and masks.
84
84
 
85
- Input arguments are ch_in, number of protos, number of masks.
85
+ Args:
86
+ c1 (int): Input channels.
87
+ c_ (int): Intermediate channels.
88
+ c2 (int): Output channels (number of protos).
86
89
  """
87
90
  super().__init__()
88
91
  self.cv1 = Conv(c1, c_, k=3)
@@ -91,7 +94,7 @@ class Proto(nn.Module):
91
94
  self.cv3 = Conv(c_, c2)
92
95
 
93
96
  def forward(self, x):
94
- """Performs a forward pass through layers using an upsampled input image."""
97
+ """Perform a forward pass through layers using an upsampled input image."""
95
98
  return self.cv3(self.cv2(self.upsample(self.cv1(x))))
96
99
 
97
100
 
@@ -103,7 +106,14 @@ class HGStem(nn.Module):
103
106
  """
104
107
 
105
108
  def __init__(self, c1, cm, c2):
106
- """Initialize the SPP layer with input/output channels and specified kernel sizes for max pooling."""
109
+ """
110
+ Initialize the StemBlock of PPHGNetV2.
111
+
112
+ Args:
113
+ c1 (int): Input channels.
114
+ cm (int): Middle channels.
115
+ c2 (int): Output channels.
116
+ """
107
117
  super().__init__()
108
118
  self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
109
119
  self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
@@ -134,7 +144,19 @@ class HGBlock(nn.Module):
134
144
  """
135
145
 
136
146
  def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
137
- """Initializes a CSP Bottleneck with 1 convolution using specified input and output channels."""
147
+ """
148
+ Initialize HGBlock with specified parameters.
149
+
150
+ Args:
151
+ c1 (int): Input channels.
152
+ cm (int): Middle channels.
153
+ c2 (int): Output channels.
154
+ k (int): Kernel size.
155
+ n (int): Number of LightConv or Conv blocks.
156
+ lightconv (bool): Whether to use LightConv.
157
+ shortcut (bool): Whether to use shortcut connection.
158
+ act (nn.Module): Activation function.
159
+ """
138
160
  super().__init__()
139
161
  block = LightConv if lightconv else Conv
140
162
  self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
@@ -154,7 +176,14 @@ class SPP(nn.Module):
154
176
  """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
155
177
 
156
178
  def __init__(self, c1, c2, k=(5, 9, 13)):
157
- """Initialize the SPP layer with input/output channels and pooling kernel sizes."""
179
+ """
180
+ Initialize the SPP layer with input/output channels and pooling kernel sizes.
181
+
182
+ Args:
183
+ c1 (int): Input channels.
184
+ c2 (int): Output channels.
185
+ k (Tuple[int, int, int]): Kernel sizes for max pooling.
186
+ """
158
187
  super().__init__()
159
188
  c_ = c1 // 2 # hidden channels
160
189
  self.cv1 = Conv(c1, c_, 1, 1)
@@ -172,9 +201,15 @@ class SPPF(nn.Module):
172
201
 
173
202
  def __init__(self, c1, c2, k=5):
174
203
  """
175
- Initializes the SPPF layer with given input/output channels and kernel size.
204
+ Initialize the SPPF layer with given input/output channels and kernel size.
176
205
 
177
- This module is equivalent to SPP(k=(5, 9, 13)).
206
+ Args:
207
+ c1 (int): Input channels.
208
+ c2 (int): Output channels.
209
+ k (int): Kernel size.
210
+
211
+ Notes:
212
+ This module is equivalent to SPP(k=(5, 9, 13)).
178
213
  """
179
214
  super().__init__()
180
215
  c_ = c1 // 2 # hidden channels
@@ -183,7 +218,7 @@ class SPPF(nn.Module):
183
218
  self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
184
219
 
185
220
  def forward(self, x):
186
- """Forward pass through Ghost Convolution block."""
221
+ """Apply sequential pooling operations to input and return concatenated feature maps."""
187
222
  y = [self.cv1(x)]
188
223
  y.extend(self.m(y[-1]) for _ in range(3))
189
224
  return self.cv2(torch.cat(y, 1))
@@ -193,13 +228,20 @@ class C1(nn.Module):
193
228
  """CSP Bottleneck with 1 convolution."""
194
229
 
195
230
  def __init__(self, c1, c2, n=1):
196
- """Initializes the CSP Bottleneck with configurations for 1 convolution with arguments ch_in, ch_out, number."""
231
+ """
232
+ Initialize the CSP Bottleneck with 1 convolution.
233
+
234
+ Args:
235
+ c1 (int): Input channels.
236
+ c2 (int): Output channels.
237
+ n (int): Number of convolutions.
238
+ """
197
239
  super().__init__()
198
240
  self.cv1 = Conv(c1, c2, 1, 1)
199
241
  self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
200
242
 
201
243
  def forward(self, x):
202
- """Applies cross-convolutions to input in the C3 module."""
244
+ """Apply convolution and residual connection to input tensor."""
203
245
  y = self.cv1(x)
204
246
  return self.m(y) + y
205
247
 
@@ -208,7 +250,17 @@ class C2(nn.Module):
208
250
  """CSP Bottleneck with 2 convolutions."""
209
251
 
210
252
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
211
- """Initializes a CSP Bottleneck with 2 convolutions and optional shortcut connection."""
253
+ """
254
+ Initialize a CSP Bottleneck with 2 convolutions.
255
+
256
+ Args:
257
+ c1 (int): Input channels.
258
+ c2 (int): Output channels.
259
+ n (int): Number of Bottleneck blocks.
260
+ shortcut (bool): Whether to use shortcut connections.
261
+ g (int): Groups for convolutions.
262
+ e (float): Expansion ratio.
263
+ """
212
264
  super().__init__()
213
265
  self.c = int(c2 * e) # hidden channels
214
266
  self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -226,7 +278,17 @@ class C2f(nn.Module):
226
278
  """Faster Implementation of CSP Bottleneck with 2 convolutions."""
227
279
 
228
280
  def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
229
- """Initializes a CSP bottleneck with 2 convolutions and n Bottleneck blocks for faster processing."""
281
+ """
282
+ Initialize a CSP bottleneck with 2 convolutions.
283
+
284
+ Args:
285
+ c1 (int): Input channels.
286
+ c2 (int): Output channels.
287
+ n (int): Number of Bottleneck blocks.
288
+ shortcut (bool): Whether to use shortcut connections.
289
+ g (int): Groups for convolutions.
290
+ e (float): Expansion ratio.
291
+ """
230
292
  super().__init__()
231
293
  self.c = int(c2 * e) # hidden channels
232
294
  self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -251,7 +313,17 @@ class C3(nn.Module):
251
313
  """CSP Bottleneck with 3 convolutions."""
252
314
 
253
315
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
254
- """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values."""
316
+ """
317
+ Initialize the CSP Bottleneck with 3 convolutions.
318
+
319
+ Args:
320
+ c1 (int): Input channels.
321
+ c2 (int): Output channels.
322
+ n (int): Number of Bottleneck blocks.
323
+ shortcut (bool): Whether to use shortcut connections.
324
+ g (int): Groups for convolutions.
325
+ e (float): Expansion ratio.
326
+ """
255
327
  super().__init__()
256
328
  c_ = int(c2 * e) # hidden channels
257
329
  self.cv1 = Conv(c1, c_, 1, 1)
@@ -260,7 +332,7 @@ class C3(nn.Module):
260
332
  self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
261
333
 
262
334
  def forward(self, x):
263
- """Forward pass through the CSP bottleneck with 2 convolutions."""
335
+ """Forward pass through the CSP bottleneck with 3 convolutions."""
264
336
  return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
265
337
 
266
338
 
@@ -268,7 +340,17 @@ class C3x(C3):
268
340
  """C3 module with cross-convolutions."""
269
341
 
270
342
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
271
- """Initialize C3TR instance and set default parameters."""
343
+ """
344
+ Initialize C3 module with cross-convolutions.
345
+
346
+ Args:
347
+ c1 (int): Input channels.
348
+ c2 (int): Output channels.
349
+ n (int): Number of Bottleneck blocks.
350
+ shortcut (bool): Whether to use shortcut connections.
351
+ g (int): Groups for convolutions.
352
+ e (float): Expansion ratio.
353
+ """
272
354
  super().__init__(c1, c2, n, shortcut, g, e)
273
355
  self.c_ = int(c2 * e)
274
356
  self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
@@ -278,7 +360,15 @@ class RepC3(nn.Module):
278
360
  """Rep C3."""
279
361
 
280
362
  def __init__(self, c1, c2, n=3, e=1.0):
281
- """Initialize CSP Bottleneck with a single convolution using input channels, output channels, and number."""
363
+ """
364
+ Initialize CSP Bottleneck with a single convolution.
365
+
366
+ Args:
367
+ c1 (int): Input channels.
368
+ c2 (int): Output channels.
369
+ n (int): Number of RepConv blocks.
370
+ e (float): Expansion ratio.
371
+ """
282
372
  super().__init__()
283
373
  c_ = int(c2 * e) # hidden channels
284
374
  self.cv1 = Conv(c1, c_, 1, 1)
@@ -287,7 +377,7 @@ class RepC3(nn.Module):
287
377
  self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()
288
378
 
289
379
  def forward(self, x):
290
- """Forward pass of RT-DETR neck layer."""
380
+ """Forward pass of RepC3 module."""
291
381
  return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
292
382
 
293
383
 
@@ -295,7 +385,17 @@ class C3TR(C3):
295
385
  """C3 module with TransformerBlock()."""
296
386
 
297
387
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
298
- """Initialize C3Ghost module with GhostBottleneck()."""
388
+ """
389
+ Initialize C3 module with TransformerBlock.
390
+
391
+ Args:
392
+ c1 (int): Input channels.
393
+ c2 (int): Output channels.
394
+ n (int): Number of Transformer blocks.
395
+ shortcut (bool): Whether to use shortcut connections.
396
+ g (int): Groups for convolutions.
397
+ e (float): Expansion ratio.
398
+ """
299
399
  super().__init__(c1, c2, n, shortcut, g, e)
300
400
  c_ = int(c2 * e)
301
401
  self.m = TransformerBlock(c_, c_, 4, n)
@@ -305,7 +405,17 @@ class C3Ghost(C3):
305
405
  """C3 module with GhostBottleneck()."""
306
406
 
307
407
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
308
- """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling."""
408
+ """
409
+ Initialize C3 module with GhostBottleneck.
410
+
411
+ Args:
412
+ c1 (int): Input channels.
413
+ c2 (int): Output channels.
414
+ n (int): Number of Ghost bottleneck blocks.
415
+ shortcut (bool): Whether to use shortcut connections.
416
+ g (int): Groups for convolutions.
417
+ e (float): Expansion ratio.
418
+ """
309
419
  super().__init__(c1, c2, n, shortcut, g, e)
310
420
  c_ = int(c2 * e) # hidden channels
311
421
  self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
@@ -315,7 +425,15 @@ class GhostBottleneck(nn.Module):
315
425
  """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
316
426
 
317
427
  def __init__(self, c1, c2, k=3, s=1):
318
- """Initializes GhostBottleneck module with arguments ch_in, ch_out, kernel, stride."""
428
+ """
429
+ Initialize Ghost Bottleneck module.
430
+
431
+ Args:
432
+ c1 (int): Input channels.
433
+ c2 (int): Output channels.
434
+ k (int): Kernel size.
435
+ s (int): Stride.
436
+ """
319
437
  super().__init__()
320
438
  c_ = c2 // 2
321
439
  self.conv = nn.Sequential(
@@ -328,7 +446,7 @@ class GhostBottleneck(nn.Module):
328
446
  )
329
447
 
330
448
  def forward(self, x):
331
- """Applies skip connection and concatenation to input tensor."""
449
+ """Apply skip connection and concatenation to input tensor."""
332
450
  return self.conv(x) + self.shortcut(x)
333
451
 
334
452
 
@@ -336,7 +454,17 @@ class Bottleneck(nn.Module):
336
454
  """Standard bottleneck."""
337
455
 
338
456
  def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
339
- """Initializes a standard bottleneck module with optional shortcut connection and configurable parameters."""
457
+ """
458
+ Initialize a standard bottleneck module.
459
+
460
+ Args:
461
+ c1 (int): Input channels.
462
+ c2 (int): Output channels.
463
+ shortcut (bool): Whether to use shortcut connection.
464
+ g (int): Groups for convolutions.
465
+ k (Tuple[int, int]): Kernel sizes for convolutions.
466
+ e (float): Expansion ratio.
467
+ """
340
468
  super().__init__()
341
469
  c_ = int(c2 * e) # hidden channels
342
470
  self.cv1 = Conv(c1, c_, k[0], 1)
@@ -344,7 +472,7 @@ class Bottleneck(nn.Module):
344
472
  self.add = shortcut and c1 == c2
345
473
 
346
474
  def forward(self, x):
347
- """Applies the YOLO FPN to input data."""
475
+ """Apply bottleneck with optional shortcut connection."""
348
476
  return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
349
477
 
350
478
 
@@ -352,7 +480,17 @@ class BottleneckCSP(nn.Module):
352
480
  """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
353
481
 
354
482
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
355
- """Initializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion."""
483
+ """
484
+ Initialize CSP Bottleneck.
485
+
486
+ Args:
487
+ c1 (int): Input channels.
488
+ c2 (int): Output channels.
489
+ n (int): Number of Bottleneck blocks.
490
+ shortcut (bool): Whether to use shortcut connections.
491
+ g (int): Groups for convolutions.
492
+ e (float): Expansion ratio.
493
+ """
356
494
  super().__init__()
357
495
  c_ = int(c2 * e) # hidden channels
358
496
  self.cv1 = Conv(c1, c_, 1, 1)
@@ -364,7 +502,7 @@ class BottleneckCSP(nn.Module):
364
502
  self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
365
503
 
366
504
  def forward(self, x):
367
- """Applies a CSP bottleneck with 3 convolutions."""
505
+ """Apply CSP bottleneck with 3 convolutions."""
368
506
  y1 = self.cv3(self.m(self.cv1(x)))
369
507
  y2 = self.cv2(x)
370
508
  return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
@@ -374,7 +512,15 @@ class ResNetBlock(nn.Module):
374
512
  """ResNet block with standard convolution layers."""
375
513
 
376
514
  def __init__(self, c1, c2, s=1, e=4):
377
- """Initialize convolution with given parameters."""
515
+ """
516
+ Initialize ResNet block.
517
+
518
+ Args:
519
+ c1 (int): Input channels.
520
+ c2 (int): Output channels.
521
+ s (int): Stride.
522
+ e (int): Expansion ratio.
523
+ """
378
524
  super().__init__()
379
525
  c3 = e * c2
380
526
  self.cv1 = Conv(c1, c2, k=1, s=1, act=True)
@@ -391,7 +537,17 @@ class ResNetLayer(nn.Module):
391
537
  """ResNet layer with multiple ResNet blocks."""
392
538
 
393
539
  def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4):
394
- """Initializes the ResNetLayer given arguments."""
540
+ """
541
+ Initialize ResNet layer.
542
+
543
+ Args:
544
+ c1 (int): Input channels.
545
+ c2 (int): Output channels.
546
+ s (int): Stride.
547
+ is_first (bool): Whether this is the first layer.
548
+ n (int): Number of ResNet blocks.
549
+ e (int): Expansion ratio.
550
+ """
395
551
  super().__init__()
396
552
  self.is_first = is_first
397
553
 
@@ -413,7 +569,17 @@ class MaxSigmoidAttnBlock(nn.Module):
413
569
  """Max Sigmoid attention block."""
414
570
 
415
571
  def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
416
- """Initializes MaxSigmoidAttnBlock with specified arguments."""
572
+ """
573
+ Initialize MaxSigmoidAttnBlock.
574
+
575
+ Args:
576
+ c1 (int): Input channels.
577
+ c2 (int): Output channels.
578
+ nh (int): Number of heads.
579
+ ec (int): Embedding channels.
580
+ gc (int): Guide channels.
581
+ scale (bool): Whether to use learnable scale parameter.
582
+ """
417
583
  super().__init__()
418
584
  self.nh = nh
419
585
  self.hc = c2 // nh
@@ -424,7 +590,16 @@ class MaxSigmoidAttnBlock(nn.Module):
424
590
  self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0
425
591
 
426
592
  def forward(self, x, guide):
427
- """Forward process."""
593
+ """
594
+ Forward pass of MaxSigmoidAttnBlock.
595
+
596
+ Args:
597
+ x (torch.Tensor): Input tensor.
598
+ guide (torch.Tensor): Guide tensor.
599
+
600
+ Returns:
601
+ (torch.Tensor): Output tensor after attention.
602
+ """
428
603
  bs, _, h, w = x.shape
429
604
 
430
605
  guide = self.gl(guide)
@@ -448,7 +623,20 @@ class C2fAttn(nn.Module):
448
623
  """C2f module with an additional attn module."""
449
624
 
450
625
  def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5):
451
- """Initializes C2f module with attention mechanism for enhanced feature extraction and processing."""
626
+ """
627
+ Initialize C2f module with attention mechanism.
628
+
629
+ Args:
630
+ c1 (int): Input channels.
631
+ c2 (int): Output channels.
632
+ n (int): Number of Bottleneck blocks.
633
+ ec (int): Embedding channels for attention.
634
+ nh (int): Number of heads for attention.
635
+ gc (int): Guide channels for attention.
636
+ shortcut (bool): Whether to use shortcut connections.
637
+ g (int): Groups for convolutions.
638
+ e (float): Expansion ratio.
639
+ """
452
640
  super().__init__()
453
641
  self.c = int(c2 * e) # hidden channels
454
642
  self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -457,14 +645,32 @@ class C2fAttn(nn.Module):
457
645
  self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)
458
646
 
459
647
  def forward(self, x, guide):
460
- """Forward pass through C2f layer."""
648
+ """
649
+ Forward pass through C2f layer with attention.
650
+
651
+ Args:
652
+ x (torch.Tensor): Input tensor.
653
+ guide (torch.Tensor): Guide tensor for attention.
654
+
655
+ Returns:
656
+ (torch.Tensor): Output tensor after processing.
657
+ """
461
658
  y = list(self.cv1(x).chunk(2, 1))
462
659
  y.extend(m(y[-1]) for m in self.m)
463
660
  y.append(self.attn(y[-1], guide))
464
661
  return self.cv2(torch.cat(y, 1))
465
662
 
466
663
  def forward_split(self, x, guide):
467
- """Forward pass using split() instead of chunk()."""
664
+ """
665
+ Forward pass using split() instead of chunk().
666
+
667
+ Args:
668
+ x (torch.Tensor): Input tensor.
669
+ guide (torch.Tensor): Guide tensor for attention.
670
+
671
+ Returns:
672
+ (torch.Tensor): Output tensor after processing.
673
+ """
468
674
  y = list(self.cv1(x).split((self.c, self.c), 1))
469
675
  y.extend(m(y[-1]) for m in self.m)
470
676
  y.append(self.attn(y[-1], guide))
@@ -475,7 +681,17 @@ class ImagePoolingAttn(nn.Module):
475
681
  """ImagePoolingAttn: Enhance the text embeddings with image-aware information."""
476
682
 
477
683
  def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False):
478
- """Initializes ImagePoolingAttn with specified arguments."""
684
+ """
685
+ Initialize ImagePoolingAttn module.
686
+
687
+ Args:
688
+ ec (int): Embedding channels.
689
+ ch (Tuple): Channel dimensions for feature maps.
690
+ ct (int): Channel dimension for text embeddings.
691
+ nh (int): Number of attention heads.
692
+ k (int): Kernel size for pooling.
693
+ scale (bool): Whether to use learnable scale parameter.
694
+ """
479
695
  super().__init__()
480
696
 
481
697
  nf = len(ch)
@@ -493,7 +709,16 @@ class ImagePoolingAttn(nn.Module):
493
709
  self.k = k
494
710
 
495
711
  def forward(self, x, text):
496
- """Executes attention mechanism on input tensor x and guide tensor."""
712
+ """
713
+ Forward pass of ImagePoolingAttn.
714
+
715
+ Args:
716
+ x (List[torch.Tensor]): List of input feature maps.
717
+ text (torch.Tensor): Text embeddings.
718
+
719
+ Returns:
720
+ (torch.Tensor): Enhanced text embeddings.
721
+ """
497
722
  bs = x[0].shape[0]
498
723
  assert len(x) == self.nf
499
724
  num_patches = self.k**2
@@ -521,14 +746,23 @@ class ContrastiveHead(nn.Module):
521
746
  """Implements contrastive learning head for region-text similarity in vision-language models."""
522
747
 
523
748
  def __init__(self):
524
- """Initializes ContrastiveHead with specified region-text similarity parameters."""
749
+ """Initialize ContrastiveHead with region-text similarity parameters."""
525
750
  super().__init__()
526
751
  # NOTE: use -10.0 to keep the init cls loss consistency with other losses
527
752
  self.bias = nn.Parameter(torch.tensor([-10.0]))
528
753
  self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
529
754
 
530
755
  def forward(self, x, w):
531
- """Forward function of contrastive learning."""
756
+ """
757
+ Forward function of contrastive learning.
758
+
759
+ Args:
760
+ x (torch.Tensor): Image features.
761
+ w (torch.Tensor): Text features.
762
+
763
+ Returns:
764
+ (torch.Tensor): Similarity scores.
765
+ """
532
766
  x = F.normalize(x, dim=1, p=2)
533
767
  w = F.normalize(w, dim=-1, p=2)
534
768
  x = torch.einsum("bchw,bkc->bkhw", x, w)
@@ -544,7 +778,12 @@ class BNContrastiveHead(nn.Module):
544
778
  """
545
779
 
546
780
  def __init__(self, embed_dims: int):
547
- """Initialize ContrastiveHead with region-text similarity parameters."""
781
+ """
782
+ Initialize BNContrastiveHead.
783
+
784
+ Args:
785
+ embed_dims (int): Embedding dimensions for features.
786
+ """
548
787
  super().__init__()
549
788
  self.norm = nn.BatchNorm2d(embed_dims)
550
789
  # NOTE: use -10.0 to keep the init cls loss consistency with other losses
@@ -553,7 +792,16 @@ class BNContrastiveHead(nn.Module):
553
792
  self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))
554
793
 
555
794
  def forward(self, x, w):
556
- """Forward function of contrastive learning."""
795
+ """
796
+ Forward function of contrastive learning with batch normalization.
797
+
798
+ Args:
799
+ x (torch.Tensor): Image features.
800
+ w (torch.Tensor): Text features.
801
+
802
+ Returns:
803
+ (torch.Tensor): Similarity scores.
804
+ """
557
805
  x = self.norm(x)
558
806
  w = F.normalize(w, dim=-1, p=2)
559
807
  x = torch.einsum("bchw,bkc->bkhw", x, w)
@@ -564,7 +812,17 @@ class RepBottleneck(Bottleneck):
564
812
  """Rep bottleneck."""
565
813
 
566
814
  def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
567
- """Initializes a RepBottleneck module with customizable in/out channels, shortcuts, groups and expansion."""
815
+ """
816
+ Initialize RepBottleneck.
817
+
818
+ Args:
819
+ c1 (int): Input channels.
820
+ c2 (int): Output channels.
821
+ shortcut (bool): Whether to use shortcut connection.
822
+ g (int): Groups for convolutions.
823
+ k (Tuple[int, int]): Kernel sizes for convolutions.
824
+ e (float): Expansion ratio.
825
+ """
568
826
  super().__init__(c1, c2, shortcut, g, k, e)
569
827
  c_ = int(c2 * e) # hidden channels
570
828
  self.cv1 = RepConv(c1, c_, k[0], 1)
@@ -574,7 +832,17 @@ class RepCSP(C3):
574
832
  """Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction."""
575
833
 
576
834
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
577
- """Initializes RepCSP layer with given channels, repetitions, shortcut, groups and expansion ratio."""
835
+ """
836
+ Initialize RepCSP layer.
837
+
838
+ Args:
839
+ c1 (int): Input channels.
840
+ c2 (int): Output channels.
841
+ n (int): Number of RepBottleneck blocks.
842
+ shortcut (bool): Whether to use shortcut connections.
843
+ g (int): Groups for convolutions.
844
+ e (float): Expansion ratio.
845
+ """
578
846
  super().__init__(c1, c2, n, shortcut, g, e)
579
847
  c_ = int(c2 * e) # hidden channels
580
848
  self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
@@ -584,7 +852,16 @@ class RepNCSPELAN4(nn.Module):
584
852
  """CSP-ELAN."""
585
853
 
586
854
  def __init__(self, c1, c2, c3, c4, n=1):
587
- """Initializes CSP-ELAN layer with specified channel sizes, repetitions, and convolutions."""
855
+ """
856
+ Initialize CSP-ELAN layer.
857
+
858
+ Args:
859
+ c1 (int): Input channels.
860
+ c2 (int): Output channels.
861
+ c3 (int): Intermediate channels.
862
+ c4 (int): Intermediate channels for RepCSP.
863
+ n (int): Number of RepCSP blocks.
864
+ """
588
865
  super().__init__()
589
866
  self.c = c3 // 2
590
867
  self.cv1 = Conv(c1, c3, 1, 1)
@@ -609,7 +886,15 @@ class ELAN1(RepNCSPELAN4):
609
886
  """ELAN1 module with 4 convolutions."""
610
887
 
611
888
  def __init__(self, c1, c2, c3, c4):
612
- """Initializes ELAN1 layer with specified channel sizes."""
889
+ """
890
+ Initialize ELAN1 layer.
891
+
892
+ Args:
893
+ c1 (int): Input channels.
894
+ c2 (int): Output channels.
895
+ c3 (int): Intermediate channels.
896
+ c4 (int): Intermediate channels for convolutions.
897
+ """
613
898
  super().__init__(c1, c2, c3, c4)
614
899
  self.c = c3 // 2
615
900
  self.cv1 = Conv(c1, c3, 1, 1)
@@ -622,7 +907,13 @@ class AConv(nn.Module):
622
907
  """AConv."""
623
908
 
624
909
  def __init__(self, c1, c2):
625
- """Initializes AConv module with convolution layers."""
910
+ """
911
+ Initialize AConv module.
912
+
913
+ Args:
914
+ c1 (int): Input channels.
915
+ c2 (int): Output channels.
916
+ """
626
917
  super().__init__()
627
918
  self.cv1 = Conv(c1, c2, 3, 2, 1)
628
919
 
@@ -636,7 +927,13 @@ class ADown(nn.Module):
636
927
  """ADown."""
637
928
 
638
929
  def __init__(self, c1, c2):
639
- """Initializes ADown module with convolution layers to downsample input from channels c1 to c2."""
930
+ """
931
+ Initialize ADown module.
932
+
933
+ Args:
934
+ c1 (int): Input channels.
935
+ c2 (int): Output channels.
936
+ """
640
937
  super().__init__()
641
938
  self.c = c2 // 2
642
939
  self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
@@ -656,7 +953,15 @@ class SPPELAN(nn.Module):
656
953
  """SPP-ELAN."""
657
954
 
658
955
  def __init__(self, c1, c2, c3, k=5):
659
- """Initializes SPP-ELAN block with convolution and max pooling layers for spatial pyramid pooling."""
956
+ """
957
+ Initialize SPP-ELAN block.
958
+
959
+ Args:
960
+ c1 (int): Input channels.
961
+ c2 (int): Output channels.
962
+ c3 (int): Intermediate channels.
963
+ k (int): Kernel size for max pooling.
964
+ """
660
965
  super().__init__()
661
966
  self.c = c3
662
967
  self.cv1 = Conv(c1, c3, 1, 1)
@@ -676,7 +981,17 @@ class CBLinear(nn.Module):
676
981
  """CBLinear."""
677
982
 
678
983
  def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
679
- """Initializes the CBLinear module, passing inputs unchanged."""
984
+ """
985
+ Initialize CBLinear module.
986
+
987
+ Args:
988
+ c1 (int): Input channels.
989
+ c2s (List[int]): List of output channel sizes.
990
+ k (int): Kernel size.
991
+ s (int): Stride.
992
+ p (int | None): Padding.
993
+ g (int): Groups.
994
+ """
680
995
  super().__init__()
681
996
  self.c2s = c2s
682
997
  self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)
@@ -690,12 +1005,25 @@ class CBFuse(nn.Module):
690
1005
  """CBFuse."""
691
1006
 
692
1007
  def __init__(self, idx):
693
- """Initializes CBFuse module with layer index for selective feature fusion."""
1008
+ """
1009
+ Initialize CBFuse module.
1010
+
1011
+ Args:
1012
+ idx (List[int]): Indices for feature selection.
1013
+ """
694
1014
  super().__init__()
695
1015
  self.idx = idx
696
1016
 
697
1017
  def forward(self, xs):
698
- """Forward pass through CBFuse layer."""
1018
+ """
1019
+ Forward pass through CBFuse layer.
1020
+
1021
+ Args:
1022
+ xs (List[torch.Tensor]): List of input tensors.
1023
+
1024
+ Returns:
1025
+ (torch.Tensor): Fused output tensor.
1026
+ """
699
1027
  target_size = xs[-1].shape[2:]
700
1028
  res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
701
1029
  return torch.sum(torch.stack(res + xs[-1:]), dim=0)
@@ -705,8 +1033,16 @@ class C3f(nn.Module):
705
1033
  """Faster Implementation of CSP Bottleneck with 2 convolutions."""
706
1034
 
707
1035
  def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
708
- """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
709
- expansion.
1036
+ """
1037
+ Initialize CSP bottleneck layer with two convolutions.
1038
+
1039
+ Args:
1040
+ c1 (int): Input channels.
1041
+ c2 (int): Output channels.
1042
+ n (int): Number of Bottleneck blocks.
1043
+ shortcut (bool): Whether to use shortcut connections.
1044
+ g (int): Groups for convolutions.
1045
+ e (float): Expansion ratio.
710
1046
  """
711
1047
  super().__init__()
712
1048
  c_ = int(c2 * e) # hidden channels
@@ -716,7 +1052,7 @@ class C3f(nn.Module):
716
1052
  self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
717
1053
 
718
1054
  def forward(self, x):
719
- """Forward pass through C2f layer."""
1055
+ """Forward pass through C3f layer."""
720
1056
  y = [self.cv2(x), self.cv1(x)]
721
1057
  y.extend(m(y[-1]) for m in self.m)
722
1058
  return self.cv3(torch.cat(y, 1))
@@ -726,7 +1062,18 @@ class C3k2(C2f):
726
1062
  """Faster Implementation of CSP Bottleneck with 2 convolutions."""
727
1063
 
728
1064
  def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
729
- """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks."""
1065
+ """
1066
+ Initialize C3k2 module.
1067
+
1068
+ Args:
1069
+ c1 (int): Input channels.
1070
+ c2 (int): Output channels.
1071
+ n (int): Number of blocks.
1072
+ c3k (bool): Whether to use C3k blocks.
1073
+ e (float): Expansion ratio.
1074
+ g (int): Groups for convolutions.
1075
+ shortcut (bool): Whether to use shortcut connections.
1076
+ """
730
1077
  super().__init__(c1, c2, n, shortcut, g, e)
731
1078
  self.m = nn.ModuleList(
732
1079
  C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
@@ -737,7 +1084,18 @@ class C3k(C3):
737
1084
  """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
738
1085
 
739
1086
  def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
740
- """Initializes the C3k module with specified channels, number of layers, and configurations."""
1087
+ """
1088
+ Initialize C3k module.
1089
+
1090
+ Args:
1091
+ c1 (int): Input channels.
1092
+ c2 (int): Output channels.
1093
+ n (int): Number of Bottleneck blocks.
1094
+ shortcut (bool): Whether to use shortcut connections.
1095
+ g (int): Groups for convolutions.
1096
+ e (float): Expansion ratio.
1097
+ k (int): Kernel size.
1098
+ """
741
1099
  super().__init__(c1, c2, n, shortcut, g, e)
742
1100
  c_ = int(c2 * e) # hidden channels
743
1101
  # self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
@@ -748,7 +1106,12 @@ class RepVGGDW(torch.nn.Module):
748
1106
  """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
749
1107
 
750
1108
  def __init__(self, ed) -> None:
751
- """Initializes RepVGGDW with depthwise separable convolutional layers for efficient processing."""
1109
+ """
1110
+ Initialize RepVGGDW module.
1111
+
1112
+ Args:
1113
+ ed (int): Input and output channels.
1114
+ """
752
1115
  super().__init__()
753
1116
  self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
754
1117
  self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
@@ -757,7 +1120,7 @@ class RepVGGDW(torch.nn.Module):
757
1120
 
758
1121
  def forward(self, x):
759
1122
  """
760
- Performs a forward pass of the RepVGGDW block.
1123
+ Perform a forward pass of the RepVGGDW block.
761
1124
 
762
1125
  Args:
763
1126
  x (torch.Tensor): Input tensor.
@@ -769,7 +1132,7 @@ class RepVGGDW(torch.nn.Module):
769
1132
 
770
1133
  def forward_fuse(self, x):
771
1134
  """
772
- Performs a forward pass of the RepVGGDW block without fusing the convolutions.
1135
+ Perform a forward pass of the RepVGGDW block without fusing the convolutions.
773
1136
 
774
1137
  Args:
775
1138
  x (torch.Tensor): Input tensor.
@@ -782,7 +1145,7 @@ class RepVGGDW(torch.nn.Module):
782
1145
  @torch.no_grad()
783
1146
  def fuse(self):
784
1147
  """
785
- Fuses the convolutional layers in the RepVGGDW block.
1148
+ Fuse the convolutional layers in the RepVGGDW block.
786
1149
 
787
1150
  This method fuses the convolutional layers and updates the weights and biases accordingly.
788
1151
  """
@@ -819,7 +1182,16 @@ class CIB(nn.Module):
819
1182
  """
820
1183
 
821
1184
  def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
822
- """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer."""
1185
+ """
1186
+ Initialize the CIB module.
1187
+
1188
+ Args:
1189
+ c1 (int): Input channels.
1190
+ c2 (int): Output channels.
1191
+ shortcut (bool): Whether to use shortcut connection.
1192
+ e (float): Expansion ratio.
1193
+ lk (bool): Whether to use RepVGGDW.
1194
+ """
823
1195
  super().__init__()
824
1196
  c_ = int(c2 * e) # hidden channels
825
1197
  self.cv1 = nn.Sequential(
@@ -860,7 +1232,18 @@ class C2fCIB(C2f):
860
1232
  """
861
1233
 
862
1234
  def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
863
- """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion."""
1235
+ """
1236
+ Initialize C2fCIB module.
1237
+
1238
+ Args:
1239
+ c1 (int): Input channels.
1240
+ c2 (int): Output channels.
1241
+ n (int): Number of CIB modules.
1242
+ shortcut (bool): Whether to use shortcut connection.
1243
+ lk (bool): Whether to use RepVGGDW in the CIB modules.
1244
+ g (int): Groups for convolutions.
1245
+ e (float): Expansion ratio.
1246
+ """
864
1247
  super().__init__(c1, c2, n, shortcut, g, e)
865
1248
  self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
866
1249
 
@@ -885,7 +1268,14 @@ class Attention(nn.Module):
885
1268
  """
886
1269
 
887
1270
  def __init__(self, dim, num_heads=8, attn_ratio=0.5):
888
- """Initializes multi-head attention module with query, key, and value convolutions and positional encoding."""
1271
+ """
1272
+ Initialize multi-head attention module.
1273
+
1274
+ Args:
1275
+ dim (int): Input dimension.
1276
+ num_heads (int): Number of attention heads.
1277
+ attn_ratio (float): Attention ratio for key dimension.
1278
+ """
889
1279
  super().__init__()
890
1280
  self.num_heads = num_heads
891
1281
  self.head_dim = dim // num_heads
@@ -944,7 +1334,15 @@ class PSABlock(nn.Module):
944
1334
  """
945
1335
 
946
1336
  def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
947
- """Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction."""
1337
+ """
1338
+ Initialize the PSABlock.
1339
+
1340
+ Args:
1341
+ c (int): Input and output channels.
1342
+ attn_ratio (float): Attention ratio for key dimension.
1343
+ num_heads (int): Number of attention heads.
1344
+ shortcut (bool): Whether to use shortcut connections.
1345
+ """
948
1346
  super().__init__()
949
1347
 
950
1348
  self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
@@ -952,7 +1350,15 @@ class PSABlock(nn.Module):
952
1350
  self.add = shortcut
953
1351
 
954
1352
  def forward(self, x):
955
- """Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor."""
1353
+ """
1354
+ Execute a forward pass through PSABlock.
1355
+
1356
+ Args:
1357
+ x (torch.Tensor): Input tensor.
1358
+
1359
+ Returns:
1360
+ (torch.Tensor): Output tensor after attention and feed-forward processing.
1361
+ """
956
1362
  x = x + self.attn(x) if self.add else self.attn(x)
957
1363
  x = x + self.ffn(x) if self.add else self.ffn(x)
958
1364
  return x
@@ -983,7 +1389,14 @@ class PSA(nn.Module):
983
1389
  """
984
1390
 
985
1391
  def __init__(self, c1, c2, e=0.5):
986
- """Initializes the PSA module with input/output channels and attention mechanism for feature extraction."""
1392
+ """
1393
+ Initialize PSA module.
1394
+
1395
+ Args:
1396
+ c1 (int): Input channels.
1397
+ c2 (int): Output channels.
1398
+ e (float): Expansion ratio.
1399
+ """
987
1400
  super().__init__()
988
1401
  assert c1 == c2
989
1402
  self.c = int(c1 * e)
@@ -994,7 +1407,15 @@ class PSA(nn.Module):
994
1407
  self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
995
1408
 
996
1409
  def forward(self, x):
997
- """Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor."""
1410
+ """
1411
+ Execute forward pass in PSA module.
1412
+
1413
+ Args:
1414
+ x (torch.Tensor): Input tensor.
1415
+
1416
+ Returns:
1417
+ (torch.Tensor): Output tensor after attention and feed-forward processing.
1418
+ """
998
1419
  a, b = self.cv1(x).split((self.c, self.c), dim=1)
999
1420
  b = b + self.attn(b)
1000
1421
  b = b + self.ffn(b)
@@ -1027,7 +1448,15 @@ class C2PSA(nn.Module):
1027
1448
  """
1028
1449
 
1029
1450
  def __init__(self, c1, c2, n=1, e=0.5):
1030
- """Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio."""
1451
+ """
1452
+ Initialize C2PSA module.
1453
+
1454
+ Args:
1455
+ c1 (int): Input channels.
1456
+ c2 (int): Output channels.
1457
+ n (int): Number of PSABlock modules.
1458
+ e (float): Expansion ratio.
1459
+ """
1031
1460
  super().__init__()
1032
1461
  assert c1 == c2
1033
1462
  self.c = int(c1 * e)
@@ -1037,7 +1466,15 @@ class C2PSA(nn.Module):
1037
1466
  self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
1038
1467
 
1039
1468
  def forward(self, x):
1040
- """Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor."""
1469
+ """
1470
+ Process the input tensor through a series of PSA blocks.
1471
+
1472
+ Args:
1473
+ x (torch.Tensor): Input tensor.
1474
+
1475
+ Returns:
1476
+ (torch.Tensor): Output tensor after processing.
1477
+ """
1041
1478
  a, b = self.cv1(x).split((self.c, self.c), dim=1)
1042
1479
  b = self.m(b)
1043
1480
  return self.cv2(torch.cat((a, b), 1))
@@ -1069,7 +1506,15 @@ class C2fPSA(C2f):
1069
1506
  """
1070
1507
 
1071
1508
  def __init__(self, c1, c2, n=1, e=0.5):
1072
- """Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction."""
1509
+ """
1510
+ Initialize C2fPSA module.
1511
+
1512
+ Args:
1513
+ c1 (int): Input channels.
1514
+ c2 (int): Output channels.
1515
+ n (int): Number of PSABlock modules.
1516
+ e (float): Expansion ratio.
1517
+ """
1073
1518
  assert c1 == c2
1074
1519
  super().__init__(c1, c2, n=n, e=e)
1075
1520
  self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
@@ -1100,13 +1545,29 @@ class SCDown(nn.Module):
1100
1545
  """
1101
1546
 
1102
1547
  def __init__(self, c1, c2, k, s):
1103
- """Initializes the SCDown module with specified input/output channels, kernel size, and stride."""
1548
+ """
1549
+ Initialize SCDown module.
1550
+
1551
+ Args:
1552
+ c1 (int): Input channels.
1553
+ c2 (int): Output channels.
1554
+ k (int): Kernel size.
1555
+ s (int): Stride.
1556
+ """
1104
1557
  super().__init__()
1105
1558
  self.cv1 = Conv(c1, c2, 1, 1)
1106
1559
  self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
1107
1560
 
1108
1561
  def forward(self, x):
1109
- """Applies convolution and downsampling to the input tensor in the SCDown module."""
1562
+ """
1563
+ Apply convolution and downsampling to the input tensor.
1564
+
1565
+ Args:
1566
+ x (torch.Tensor): Input tensor.
1567
+
1568
+ Returns:
1569
+ (torch.Tensor): Downsampled output tensor.
1570
+ """
1110
1571
  return self.cv2(self.cv1(x))
1111
1572
 
1112
1573
 
@@ -1128,7 +1589,16 @@ class TorchVision(nn.Module):
1128
1589
  """
1129
1590
 
1130
1591
  def __init__(self, model, weights="DEFAULT", unwrap=True, truncate=2, split=False):
1131
- """Load the model and weights from torchvision."""
1592
+ """
1593
+ Load the model and weights from torchvision.
1594
+
1595
+ Args:
1596
+ model (str): Name of the torchvision model to load.
1597
+ weights (str): Pre-trained weights to load.
1598
+ unwrap (bool): Whether to unwrap the model.
1599
+ truncate (int): Number of layers to truncate.
1600
+ split (bool): Whether to split the output.
1601
+ """
1132
1602
  import torchvision # scope for faster 'import ultralytics'
1133
1603
 
1134
1604
  super().__init__()
@@ -1147,7 +1617,15 @@ class TorchVision(nn.Module):
1147
1617
  self.m.head = self.m.heads = nn.Identity()
1148
1618
 
1149
1619
  def forward(self, x):
1150
- """Forward pass through the model."""
1620
+ """
1621
+ Forward pass through the model.
1622
+
1623
+ Args:
1624
+ x (torch.Tensor): Input tensor.
1625
+
1626
+ Returns:
1627
+ (torch.Tensor | List[torch.Tensor]): Output tensor or list of tensors.
1628
+ """
1151
1629
  if self.split:
1152
1630
  y = [x]
1153
1631
  y.extend(m(y[-1]) for m in self.m)
@@ -1184,7 +1662,7 @@ class AAttn(nn.Module):
1184
1662
 
1185
1663
  def __init__(self, dim, num_heads, area=1):
1186
1664
  """
1187
- Initializes an Area-attention module for YOLO models.
1665
+ Initialize an Area-attention module for YOLO models.
1188
1666
 
1189
1667
  Args:
1190
1668
  dim (int): Number of hidden channels.
@@ -1203,7 +1681,15 @@ class AAttn(nn.Module):
1203
1681
  self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)
1204
1682
 
1205
1683
  def forward(self, x):
1206
- """Processes the input tensor 'x' through the area-attention."""
1684
+ """
1685
+ Process the input tensor through the area-attention.
1686
+
1687
+ Args:
1688
+ x (torch.Tensor): Input tensor.
1689
+
1690
+ Returns:
1691
+ (torch.Tensor): Output tensor after area-attention.
1692
+ """
1207
1693
  B, C, H, W = x.shape
1208
1694
  N = H * W
1209
1695
 
@@ -1260,11 +1746,7 @@ class ABlock(nn.Module):
1260
1746
 
1261
1747
  def __init__(self, dim, num_heads, mlp_ratio=1.2, area=1):
1262
1748
  """
1263
- Initializes an Area-attention block module for efficient feature extraction in YOLO models.
1264
-
1265
- This module implements an area-attention mechanism combined with a feed-forward network for processing feature
1266
- maps. It uses a novel area-based attention approach that is more efficient than traditional self-attention
1267
- while maintaining effectiveness.
1749
+ Initialize an Area-attention block module.
1268
1750
 
1269
1751
  Args:
1270
1752
  dim (int): Number of input channels.
@@ -1281,14 +1763,27 @@ class ABlock(nn.Module):
1281
1763
  self.apply(self._init_weights)
1282
1764
 
1283
1765
  def _init_weights(self, m):
1284
- """Initialize weights using a truncated normal distribution."""
1766
+ """
1767
+ Initialize weights using a truncated normal distribution.
1768
+
1769
+ Args:
1770
+ m (nn.Module): Module to initialize.
1771
+ """
1285
1772
  if isinstance(m, nn.Conv2d):
1286
1773
  nn.init.trunc_normal_(m.weight, std=0.02)
1287
1774
  if m.bias is not None:
1288
1775
  nn.init.constant_(m.bias, 0)
1289
1776
 
1290
1777
  def forward(self, x):
1291
- """Forward pass through ABlock, applying area-attention and feed-forward layers to the input tensor."""
1778
+ """
1779
+ Forward pass through ABlock.
1780
+
1781
+ Args:
1782
+ x (torch.Tensor): Input tensor.
1783
+
1784
+ Returns:
1785
+ (torch.Tensor): Output tensor after area-attention and feed-forward processing.
1786
+ """
1292
1787
  x = x + self.attn(x)
1293
1788
  return x + self.mlp(x)
1294
1789
 
@@ -1319,7 +1814,7 @@ class A2C2f(nn.Module):
1319
1814
 
1320
1815
  def __init__(self, c1, c2, n=1, a2=True, area=1, residual=False, mlp_ratio=2.0, e=0.5, g=1, shortcut=True):
1321
1816
  """
1322
- Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
1817
+ Initialize Area-Attention C2f module.
1323
1818
 
1324
1819
  Args:
1325
1820
  c1 (int): Number of input channels.
@@ -1349,7 +1844,15 @@ class A2C2f(nn.Module):
1349
1844
  )
1350
1845
 
1351
1846
  def forward(self, x):
1352
- """Forward pass through R-ELAN layer."""
1847
+ """
1848
+ Forward pass through A2C2f layer.
1849
+
1850
+ Args:
1851
+ x (torch.Tensor): Input tensor.
1852
+
1853
+ Returns:
1854
+ (torch.Tensor): Output tensor after processing.
1855
+ """
1353
1856
  y = [self.cv1(x)]
1354
1857
  y.extend(m(y[-1]) for m in self.m)
1355
1858
  y = self.cv2(torch.cat(y, 1))