ultralytics 8.3.89__py3-none-any.whl → 8.3.91__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/conftest.py +2 -2
- tests/test_cli.py +13 -11
- tests/test_cuda.py +10 -1
- tests/test_exports.py +2 -2
- tests/test_integrations.py +1 -5
- tests/test_python.py +16 -16
- tests/test_solutions.py +9 -9
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +3 -1
- ultralytics/cfg/models/11/yolo11-cls.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-obb.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-pose.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-seg.yaml +5 -5
- ultralytics/cfg/models/11/yolo11.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-obb.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-world.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8.yaml +5 -5
- ultralytics/cfg/models/v9/yolov9c-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9c.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9m.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9t.yaml +1 -1
- ultralytics/data/annotator.py +9 -14
- ultralytics/data/base.py +118 -30
- ultralytics/data/build.py +63 -24
- ultralytics/data/converter.py +5 -5
- ultralytics/data/dataset.py +207 -53
- ultralytics/data/loaders.py +1 -0
- ultralytics/data/split_dota.py +39 -12
- ultralytics/data/utils.py +15 -19
- ultralytics/engine/exporter.py +24 -23
- ultralytics/engine/model.py +67 -88
- ultralytics/engine/predictor.py +106 -21
- ultralytics/engine/trainer.py +32 -23
- ultralytics/engine/tuner.py +21 -18
- ultralytics/engine/validator.py +75 -41
- ultralytics/hub/__init__.py +12 -13
- ultralytics/hub/auth.py +9 -12
- ultralytics/hub/session.py +76 -21
- ultralytics/hub/utils.py +19 -17
- ultralytics/models/fastsam/model.py +20 -11
- ultralytics/models/fastsam/predict.py +36 -16
- ultralytics/models/fastsam/utils.py +5 -5
- ultralytics/models/fastsam/val.py +6 -6
- ultralytics/models/nas/model.py +22 -11
- ultralytics/models/nas/predict.py +9 -4
- ultralytics/models/nas/val.py +5 -5
- ultralytics/models/rtdetr/model.py +20 -11
- ultralytics/models/rtdetr/predict.py +18 -15
- ultralytics/models/rtdetr/train.py +20 -16
- ultralytics/models/rtdetr/val.py +42 -6
- ultralytics/models/sam/__init__.py +1 -1
- ultralytics/models/sam/amg.py +50 -4
- ultralytics/models/sam/model.py +8 -14
- ultralytics/models/sam/modules/decoders.py +18 -21
- ultralytics/models/sam/modules/encoders.py +25 -46
- ultralytics/models/sam/modules/memory_attention.py +19 -15
- ultralytics/models/sam/modules/sam.py +18 -25
- ultralytics/models/sam/modules/tiny_encoder.py +19 -29
- ultralytics/models/sam/modules/transformer.py +35 -57
- ultralytics/models/sam/modules/utils.py +15 -15
- ultralytics/models/sam/predict.py +0 -3
- ultralytics/models/utils/loss.py +87 -36
- ultralytics/models/utils/ops.py +26 -31
- ultralytics/models/yolo/classify/predict.py +24 -3
- ultralytics/models/yolo/classify/train.py +77 -10
- ultralytics/models/yolo/classify/val.py +40 -15
- ultralytics/models/yolo/detect/predict.py +23 -10
- ultralytics/models/yolo/detect/train.py +85 -15
- ultralytics/models/yolo/detect/val.py +145 -21
- ultralytics/models/yolo/model.py +1 -2
- ultralytics/models/yolo/obb/predict.py +12 -4
- ultralytics/models/yolo/obb/train.py +7 -0
- ultralytics/models/yolo/obb/val.py +25 -7
- ultralytics/models/yolo/pose/predict.py +22 -6
- ultralytics/models/yolo/pose/train.py +17 -1
- ultralytics/models/yolo/pose/val.py +46 -21
- ultralytics/models/yolo/segment/predict.py +22 -8
- ultralytics/models/yolo/segment/train.py +6 -0
- ultralytics/models/yolo/segment/val.py +100 -14
- ultralytics/models/yolo/world/train.py +38 -8
- ultralytics/models/yolo/world/train_world.py +39 -10
- ultralytics/nn/autobackend.py +28 -14
- ultralytics/nn/modules/__init__.py +3 -0
- ultralytics/nn/modules/activation.py +12 -3
- ultralytics/nn/modules/block.py +587 -84
- ultralytics/nn/modules/conv.py +418 -54
- ultralytics/nn/modules/head.py +3 -4
- ultralytics/nn/modules/transformer.py +320 -34
- ultralytics/nn/modules/utils.py +17 -3
- ultralytics/nn/tasks.py +221 -69
- ultralytics/solutions/ai_gym.py +2 -2
- ultralytics/solutions/analytics.py +4 -4
- ultralytics/solutions/heatmap.py +4 -4
- ultralytics/solutions/instance_segmentation.py +10 -4
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +2 -2
- ultralytics/solutions/parking_management.py +9 -9
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/region_counter.py +2 -2
- ultralytics/solutions/security_alarm.py +7 -7
- ultralytics/solutions/solutions.py +7 -4
- ultralytics/solutions/speed_estimation.py +2 -2
- ultralytics/solutions/streamlit_inference.py +6 -6
- ultralytics/solutions/trackzone.py +9 -2
- ultralytics/solutions/vision_eye.py +4 -4
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/bot_sort.py +23 -22
- ultralytics/trackers/byte_tracker.py +4 -4
- ultralytics/trackers/track.py +2 -1
- ultralytics/trackers/utils/gmc.py +26 -27
- ultralytics/trackers/utils/kalman_filter.py +31 -29
- ultralytics/trackers/utils/matching.py +7 -7
- ultralytics/utils/__init__.py +32 -27
- ultralytics/utils/autobatch.py +5 -5
- ultralytics/utils/benchmarks.py +111 -18
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +11 -11
- ultralytics/utils/callbacks/comet.py +42 -24
- ultralytics/utils/callbacks/dvc.py +11 -10
- ultralytics/utils/callbacks/hub.py +8 -8
- ultralytics/utils/callbacks/mlflow.py +1 -1
- ultralytics/utils/callbacks/neptune.py +12 -10
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +6 -6
- ultralytics/utils/callbacks/wb.py +16 -16
- ultralytics/utils/checks.py +116 -35
- ultralytics/utils/dist.py +15 -2
- ultralytics/utils/downloads.py +13 -9
- ultralytics/utils/files.py +12 -13
- ultralytics/utils/instance.py +112 -45
- ultralytics/utils/loss.py +28 -33
- ultralytics/utils/metrics.py +246 -181
- ultralytics/utils/ops.py +61 -53
- ultralytics/utils/patches.py +8 -6
- ultralytics/utils/plotting.py +65 -45
- ultralytics/utils/tal.py +88 -57
- ultralytics/utils/torch_utils.py +181 -33
- ultralytics/utils/triton.py +13 -3
- ultralytics/utils/tuner.py +8 -16
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/METADATA +1 -1
- ultralytics-8.3.91.dist-info/RECORD +250 -0
- ultralytics-8.3.89.dist-info/RECORD +0 -250
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/LICENSE +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/block.py
CHANGED
@@ -69,7 +69,7 @@ class DFL(nn.Module):
|
|
69
69
|
self.c1 = c1
|
70
70
|
|
71
71
|
def forward(self, x):
|
72
|
-
"""
|
72
|
+
"""Apply the DFL module to input tensor and return transformed output."""
|
73
73
|
b, _, a = x.shape # batch, channels, anchors
|
74
74
|
return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
|
75
75
|
# return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
|
@@ -80,9 +80,12 @@ class Proto(nn.Module):
|
|
80
80
|
|
81
81
|
def __init__(self, c1, c_=256, c2=32):
|
82
82
|
"""
|
83
|
-
|
83
|
+
Initialize the YOLOv8 mask Proto module with specified number of protos and masks.
|
84
84
|
|
85
|
-
|
85
|
+
Args:
|
86
|
+
c1 (int): Input channels.
|
87
|
+
c_ (int): Intermediate channels.
|
88
|
+
c2 (int): Output channels (number of protos).
|
86
89
|
"""
|
87
90
|
super().__init__()
|
88
91
|
self.cv1 = Conv(c1, c_, k=3)
|
@@ -91,7 +94,7 @@ class Proto(nn.Module):
|
|
91
94
|
self.cv3 = Conv(c_, c2)
|
92
95
|
|
93
96
|
def forward(self, x):
|
94
|
-
"""
|
97
|
+
"""Perform a forward pass through layers using an upsampled input image."""
|
95
98
|
return self.cv3(self.cv2(self.upsample(self.cv1(x))))
|
96
99
|
|
97
100
|
|
@@ -103,7 +106,14 @@ class HGStem(nn.Module):
|
|
103
106
|
"""
|
104
107
|
|
105
108
|
def __init__(self, c1, cm, c2):
|
106
|
-
"""
|
109
|
+
"""
|
110
|
+
Initialize the StemBlock of PPHGNetV2.
|
111
|
+
|
112
|
+
Args:
|
113
|
+
c1 (int): Input channels.
|
114
|
+
cm (int): Middle channels.
|
115
|
+
c2 (int): Output channels.
|
116
|
+
"""
|
107
117
|
super().__init__()
|
108
118
|
self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
|
109
119
|
self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
|
@@ -134,7 +144,19 @@ class HGBlock(nn.Module):
|
|
134
144
|
"""
|
135
145
|
|
136
146
|
def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
|
137
|
-
"""
|
147
|
+
"""
|
148
|
+
Initialize HGBlock with specified parameters.
|
149
|
+
|
150
|
+
Args:
|
151
|
+
c1 (int): Input channels.
|
152
|
+
cm (int): Middle channels.
|
153
|
+
c2 (int): Output channels.
|
154
|
+
k (int): Kernel size.
|
155
|
+
n (int): Number of LightConv or Conv blocks.
|
156
|
+
lightconv (bool): Whether to use LightConv.
|
157
|
+
shortcut (bool): Whether to use shortcut connection.
|
158
|
+
act (nn.Module): Activation function.
|
159
|
+
"""
|
138
160
|
super().__init__()
|
139
161
|
block = LightConv if lightconv else Conv
|
140
162
|
self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
|
@@ -154,7 +176,14 @@ class SPP(nn.Module):
|
|
154
176
|
"""Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
|
155
177
|
|
156
178
|
def __init__(self, c1, c2, k=(5, 9, 13)):
|
157
|
-
"""
|
179
|
+
"""
|
180
|
+
Initialize the SPP layer with input/output channels and pooling kernel sizes.
|
181
|
+
|
182
|
+
Args:
|
183
|
+
c1 (int): Input channels.
|
184
|
+
c2 (int): Output channels.
|
185
|
+
k (Tuple[int, int, int]): Kernel sizes for max pooling.
|
186
|
+
"""
|
158
187
|
super().__init__()
|
159
188
|
c_ = c1 // 2 # hidden channels
|
160
189
|
self.cv1 = Conv(c1, c_, 1, 1)
|
@@ -172,9 +201,15 @@ class SPPF(nn.Module):
|
|
172
201
|
|
173
202
|
def __init__(self, c1, c2, k=5):
|
174
203
|
"""
|
175
|
-
|
204
|
+
Initialize the SPPF layer with given input/output channels and kernel size.
|
176
205
|
|
177
|
-
|
206
|
+
Args:
|
207
|
+
c1 (int): Input channels.
|
208
|
+
c2 (int): Output channels.
|
209
|
+
k (int): Kernel size.
|
210
|
+
|
211
|
+
Notes:
|
212
|
+
This module is equivalent to SPP(k=(5, 9, 13)).
|
178
213
|
"""
|
179
214
|
super().__init__()
|
180
215
|
c_ = c1 // 2 # hidden channels
|
@@ -183,7 +218,7 @@ class SPPF(nn.Module):
|
|
183
218
|
self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
|
184
219
|
|
185
220
|
def forward(self, x):
|
186
|
-
"""
|
221
|
+
"""Apply sequential pooling operations to input and return concatenated feature maps."""
|
187
222
|
y = [self.cv1(x)]
|
188
223
|
y.extend(self.m(y[-1]) for _ in range(3))
|
189
224
|
return self.cv2(torch.cat(y, 1))
|
@@ -193,13 +228,20 @@ class C1(nn.Module):
|
|
193
228
|
"""CSP Bottleneck with 1 convolution."""
|
194
229
|
|
195
230
|
def __init__(self, c1, c2, n=1):
|
196
|
-
"""
|
231
|
+
"""
|
232
|
+
Initialize the CSP Bottleneck with 1 convolution.
|
233
|
+
|
234
|
+
Args:
|
235
|
+
c1 (int): Input channels.
|
236
|
+
c2 (int): Output channels.
|
237
|
+
n (int): Number of convolutions.
|
238
|
+
"""
|
197
239
|
super().__init__()
|
198
240
|
self.cv1 = Conv(c1, c2, 1, 1)
|
199
241
|
self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
|
200
242
|
|
201
243
|
def forward(self, x):
|
202
|
-
"""
|
244
|
+
"""Apply convolution and residual connection to input tensor."""
|
203
245
|
y = self.cv1(x)
|
204
246
|
return self.m(y) + y
|
205
247
|
|
@@ -208,7 +250,17 @@ class C2(nn.Module):
|
|
208
250
|
"""CSP Bottleneck with 2 convolutions."""
|
209
251
|
|
210
252
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
211
|
-
"""
|
253
|
+
"""
|
254
|
+
Initialize a CSP Bottleneck with 2 convolutions.
|
255
|
+
|
256
|
+
Args:
|
257
|
+
c1 (int): Input channels.
|
258
|
+
c2 (int): Output channels.
|
259
|
+
n (int): Number of Bottleneck blocks.
|
260
|
+
shortcut (bool): Whether to use shortcut connections.
|
261
|
+
g (int): Groups for convolutions.
|
262
|
+
e (float): Expansion ratio.
|
263
|
+
"""
|
212
264
|
super().__init__()
|
213
265
|
self.c = int(c2 * e) # hidden channels
|
214
266
|
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
|
@@ -226,7 +278,17 @@ class C2f(nn.Module):
|
|
226
278
|
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
|
227
279
|
|
228
280
|
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
|
229
|
-
"""
|
281
|
+
"""
|
282
|
+
Initialize a CSP bottleneck with 2 convolutions.
|
283
|
+
|
284
|
+
Args:
|
285
|
+
c1 (int): Input channels.
|
286
|
+
c2 (int): Output channels.
|
287
|
+
n (int): Number of Bottleneck blocks.
|
288
|
+
shortcut (bool): Whether to use shortcut connections.
|
289
|
+
g (int): Groups for convolutions.
|
290
|
+
e (float): Expansion ratio.
|
291
|
+
"""
|
230
292
|
super().__init__()
|
231
293
|
self.c = int(c2 * e) # hidden channels
|
232
294
|
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
|
@@ -251,7 +313,17 @@ class C3(nn.Module):
|
|
251
313
|
"""CSP Bottleneck with 3 convolutions."""
|
252
314
|
|
253
315
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
254
|
-
"""
|
316
|
+
"""
|
317
|
+
Initialize the CSP Bottleneck with 3 convolutions.
|
318
|
+
|
319
|
+
Args:
|
320
|
+
c1 (int): Input channels.
|
321
|
+
c2 (int): Output channels.
|
322
|
+
n (int): Number of Bottleneck blocks.
|
323
|
+
shortcut (bool): Whether to use shortcut connections.
|
324
|
+
g (int): Groups for convolutions.
|
325
|
+
e (float): Expansion ratio.
|
326
|
+
"""
|
255
327
|
super().__init__()
|
256
328
|
c_ = int(c2 * e) # hidden channels
|
257
329
|
self.cv1 = Conv(c1, c_, 1, 1)
|
@@ -260,7 +332,7 @@ class C3(nn.Module):
|
|
260
332
|
self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
|
261
333
|
|
262
334
|
def forward(self, x):
|
263
|
-
"""Forward pass through the CSP bottleneck with
|
335
|
+
"""Forward pass through the CSP bottleneck with 3 convolutions."""
|
264
336
|
return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
|
265
337
|
|
266
338
|
|
@@ -268,7 +340,17 @@ class C3x(C3):
|
|
268
340
|
"""C3 module with cross-convolutions."""
|
269
341
|
|
270
342
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
271
|
-
"""
|
343
|
+
"""
|
344
|
+
Initialize C3 module with cross-convolutions.
|
345
|
+
|
346
|
+
Args:
|
347
|
+
c1 (int): Input channels.
|
348
|
+
c2 (int): Output channels.
|
349
|
+
n (int): Number of Bottleneck blocks.
|
350
|
+
shortcut (bool): Whether to use shortcut connections.
|
351
|
+
g (int): Groups for convolutions.
|
352
|
+
e (float): Expansion ratio.
|
353
|
+
"""
|
272
354
|
super().__init__(c1, c2, n, shortcut, g, e)
|
273
355
|
self.c_ = int(c2 * e)
|
274
356
|
self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
|
@@ -278,7 +360,15 @@ class RepC3(nn.Module):
|
|
278
360
|
"""Rep C3."""
|
279
361
|
|
280
362
|
def __init__(self, c1, c2, n=3, e=1.0):
|
281
|
-
"""
|
363
|
+
"""
|
364
|
+
Initialize CSP Bottleneck with a single convolution.
|
365
|
+
|
366
|
+
Args:
|
367
|
+
c1 (int): Input channels.
|
368
|
+
c2 (int): Output channels.
|
369
|
+
n (int): Number of RepConv blocks.
|
370
|
+
e (float): Expansion ratio.
|
371
|
+
"""
|
282
372
|
super().__init__()
|
283
373
|
c_ = int(c2 * e) # hidden channels
|
284
374
|
self.cv1 = Conv(c1, c_, 1, 1)
|
@@ -287,7 +377,7 @@ class RepC3(nn.Module):
|
|
287
377
|
self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()
|
288
378
|
|
289
379
|
def forward(self, x):
|
290
|
-
"""Forward pass of
|
380
|
+
"""Forward pass of RepC3 module."""
|
291
381
|
return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
|
292
382
|
|
293
383
|
|
@@ -295,7 +385,17 @@ class C3TR(C3):
|
|
295
385
|
"""C3 module with TransformerBlock()."""
|
296
386
|
|
297
387
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
298
|
-
"""
|
388
|
+
"""
|
389
|
+
Initialize C3 module with TransformerBlock.
|
390
|
+
|
391
|
+
Args:
|
392
|
+
c1 (int): Input channels.
|
393
|
+
c2 (int): Output channels.
|
394
|
+
n (int): Number of Transformer blocks.
|
395
|
+
shortcut (bool): Whether to use shortcut connections.
|
396
|
+
g (int): Groups for convolutions.
|
397
|
+
e (float): Expansion ratio.
|
398
|
+
"""
|
299
399
|
super().__init__(c1, c2, n, shortcut, g, e)
|
300
400
|
c_ = int(c2 * e)
|
301
401
|
self.m = TransformerBlock(c_, c_, 4, n)
|
@@ -305,7 +405,17 @@ class C3Ghost(C3):
|
|
305
405
|
"""C3 module with GhostBottleneck()."""
|
306
406
|
|
307
407
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
308
|
-
"""
|
408
|
+
"""
|
409
|
+
Initialize C3 module with GhostBottleneck.
|
410
|
+
|
411
|
+
Args:
|
412
|
+
c1 (int): Input channels.
|
413
|
+
c2 (int): Output channels.
|
414
|
+
n (int): Number of Ghost bottleneck blocks.
|
415
|
+
shortcut (bool): Whether to use shortcut connections.
|
416
|
+
g (int): Groups for convolutions.
|
417
|
+
e (float): Expansion ratio.
|
418
|
+
"""
|
309
419
|
super().__init__(c1, c2, n, shortcut, g, e)
|
310
420
|
c_ = int(c2 * e) # hidden channels
|
311
421
|
self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
|
@@ -315,7 +425,15 @@ class GhostBottleneck(nn.Module):
|
|
315
425
|
"""Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
|
316
426
|
|
317
427
|
def __init__(self, c1, c2, k=3, s=1):
|
318
|
-
"""
|
428
|
+
"""
|
429
|
+
Initialize Ghost Bottleneck module.
|
430
|
+
|
431
|
+
Args:
|
432
|
+
c1 (int): Input channels.
|
433
|
+
c2 (int): Output channels.
|
434
|
+
k (int): Kernel size.
|
435
|
+
s (int): Stride.
|
436
|
+
"""
|
319
437
|
super().__init__()
|
320
438
|
c_ = c2 // 2
|
321
439
|
self.conv = nn.Sequential(
|
@@ -328,7 +446,7 @@ class GhostBottleneck(nn.Module):
|
|
328
446
|
)
|
329
447
|
|
330
448
|
def forward(self, x):
|
331
|
-
"""
|
449
|
+
"""Apply skip connection and concatenation to input tensor."""
|
332
450
|
return self.conv(x) + self.shortcut(x)
|
333
451
|
|
334
452
|
|
@@ -336,7 +454,17 @@ class Bottleneck(nn.Module):
|
|
336
454
|
"""Standard bottleneck."""
|
337
455
|
|
338
456
|
def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
|
339
|
-
"""
|
457
|
+
"""
|
458
|
+
Initialize a standard bottleneck module.
|
459
|
+
|
460
|
+
Args:
|
461
|
+
c1 (int): Input channels.
|
462
|
+
c2 (int): Output channels.
|
463
|
+
shortcut (bool): Whether to use shortcut connection.
|
464
|
+
g (int): Groups for convolutions.
|
465
|
+
k (Tuple[int, int]): Kernel sizes for convolutions.
|
466
|
+
e (float): Expansion ratio.
|
467
|
+
"""
|
340
468
|
super().__init__()
|
341
469
|
c_ = int(c2 * e) # hidden channels
|
342
470
|
self.cv1 = Conv(c1, c_, k[0], 1)
|
@@ -344,7 +472,7 @@ class Bottleneck(nn.Module):
|
|
344
472
|
self.add = shortcut and c1 == c2
|
345
473
|
|
346
474
|
def forward(self, x):
|
347
|
-
"""
|
475
|
+
"""Apply bottleneck with optional shortcut connection."""
|
348
476
|
return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
|
349
477
|
|
350
478
|
|
@@ -352,7 +480,17 @@ class BottleneckCSP(nn.Module):
|
|
352
480
|
"""CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
|
353
481
|
|
354
482
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
355
|
-
"""
|
483
|
+
"""
|
484
|
+
Initialize CSP Bottleneck.
|
485
|
+
|
486
|
+
Args:
|
487
|
+
c1 (int): Input channels.
|
488
|
+
c2 (int): Output channels.
|
489
|
+
n (int): Number of Bottleneck blocks.
|
490
|
+
shortcut (bool): Whether to use shortcut connections.
|
491
|
+
g (int): Groups for convolutions.
|
492
|
+
e (float): Expansion ratio.
|
493
|
+
"""
|
356
494
|
super().__init__()
|
357
495
|
c_ = int(c2 * e) # hidden channels
|
358
496
|
self.cv1 = Conv(c1, c_, 1, 1)
|
@@ -364,7 +502,7 @@ class BottleneckCSP(nn.Module):
|
|
364
502
|
self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
|
365
503
|
|
366
504
|
def forward(self, x):
|
367
|
-
"""
|
505
|
+
"""Apply CSP bottleneck with 3 convolutions."""
|
368
506
|
y1 = self.cv3(self.m(self.cv1(x)))
|
369
507
|
y2 = self.cv2(x)
|
370
508
|
return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
|
@@ -374,7 +512,15 @@ class ResNetBlock(nn.Module):
|
|
374
512
|
"""ResNet block with standard convolution layers."""
|
375
513
|
|
376
514
|
def __init__(self, c1, c2, s=1, e=4):
|
377
|
-
"""
|
515
|
+
"""
|
516
|
+
Initialize ResNet block.
|
517
|
+
|
518
|
+
Args:
|
519
|
+
c1 (int): Input channels.
|
520
|
+
c2 (int): Output channels.
|
521
|
+
s (int): Stride.
|
522
|
+
e (int): Expansion ratio.
|
523
|
+
"""
|
378
524
|
super().__init__()
|
379
525
|
c3 = e * c2
|
380
526
|
self.cv1 = Conv(c1, c2, k=1, s=1, act=True)
|
@@ -391,7 +537,17 @@ class ResNetLayer(nn.Module):
|
|
391
537
|
"""ResNet layer with multiple ResNet blocks."""
|
392
538
|
|
393
539
|
def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4):
|
394
|
-
"""
|
540
|
+
"""
|
541
|
+
Initialize ResNet layer.
|
542
|
+
|
543
|
+
Args:
|
544
|
+
c1 (int): Input channels.
|
545
|
+
c2 (int): Output channels.
|
546
|
+
s (int): Stride.
|
547
|
+
is_first (bool): Whether this is the first layer.
|
548
|
+
n (int): Number of ResNet blocks.
|
549
|
+
e (int): Expansion ratio.
|
550
|
+
"""
|
395
551
|
super().__init__()
|
396
552
|
self.is_first = is_first
|
397
553
|
|
@@ -413,7 +569,17 @@ class MaxSigmoidAttnBlock(nn.Module):
|
|
413
569
|
"""Max Sigmoid attention block."""
|
414
570
|
|
415
571
|
def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
|
416
|
-
"""
|
572
|
+
"""
|
573
|
+
Initialize MaxSigmoidAttnBlock.
|
574
|
+
|
575
|
+
Args:
|
576
|
+
c1 (int): Input channels.
|
577
|
+
c2 (int): Output channels.
|
578
|
+
nh (int): Number of heads.
|
579
|
+
ec (int): Embedding channels.
|
580
|
+
gc (int): Guide channels.
|
581
|
+
scale (bool): Whether to use learnable scale parameter.
|
582
|
+
"""
|
417
583
|
super().__init__()
|
418
584
|
self.nh = nh
|
419
585
|
self.hc = c2 // nh
|
@@ -424,7 +590,16 @@ class MaxSigmoidAttnBlock(nn.Module):
|
|
424
590
|
self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0
|
425
591
|
|
426
592
|
def forward(self, x, guide):
|
427
|
-
"""
|
593
|
+
"""
|
594
|
+
Forward pass of MaxSigmoidAttnBlock.
|
595
|
+
|
596
|
+
Args:
|
597
|
+
x (torch.Tensor): Input tensor.
|
598
|
+
guide (torch.Tensor): Guide tensor.
|
599
|
+
|
600
|
+
Returns:
|
601
|
+
(torch.Tensor): Output tensor after attention.
|
602
|
+
"""
|
428
603
|
bs, _, h, w = x.shape
|
429
604
|
|
430
605
|
guide = self.gl(guide)
|
@@ -448,7 +623,20 @@ class C2fAttn(nn.Module):
|
|
448
623
|
"""C2f module with an additional attn module."""
|
449
624
|
|
450
625
|
def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5):
|
451
|
-
"""
|
626
|
+
"""
|
627
|
+
Initialize C2f module with attention mechanism.
|
628
|
+
|
629
|
+
Args:
|
630
|
+
c1 (int): Input channels.
|
631
|
+
c2 (int): Output channels.
|
632
|
+
n (int): Number of Bottleneck blocks.
|
633
|
+
ec (int): Embedding channels for attention.
|
634
|
+
nh (int): Number of heads for attention.
|
635
|
+
gc (int): Guide channels for attention.
|
636
|
+
shortcut (bool): Whether to use shortcut connections.
|
637
|
+
g (int): Groups for convolutions.
|
638
|
+
e (float): Expansion ratio.
|
639
|
+
"""
|
452
640
|
super().__init__()
|
453
641
|
self.c = int(c2 * e) # hidden channels
|
454
642
|
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
|
@@ -457,14 +645,32 @@ class C2fAttn(nn.Module):
|
|
457
645
|
self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)
|
458
646
|
|
459
647
|
def forward(self, x, guide):
|
460
|
-
"""
|
648
|
+
"""
|
649
|
+
Forward pass through C2f layer with attention.
|
650
|
+
|
651
|
+
Args:
|
652
|
+
x (torch.Tensor): Input tensor.
|
653
|
+
guide (torch.Tensor): Guide tensor for attention.
|
654
|
+
|
655
|
+
Returns:
|
656
|
+
(torch.Tensor): Output tensor after processing.
|
657
|
+
"""
|
461
658
|
y = list(self.cv1(x).chunk(2, 1))
|
462
659
|
y.extend(m(y[-1]) for m in self.m)
|
463
660
|
y.append(self.attn(y[-1], guide))
|
464
661
|
return self.cv2(torch.cat(y, 1))
|
465
662
|
|
466
663
|
def forward_split(self, x, guide):
|
467
|
-
"""
|
664
|
+
"""
|
665
|
+
Forward pass using split() instead of chunk().
|
666
|
+
|
667
|
+
Args:
|
668
|
+
x (torch.Tensor): Input tensor.
|
669
|
+
guide (torch.Tensor): Guide tensor for attention.
|
670
|
+
|
671
|
+
Returns:
|
672
|
+
(torch.Tensor): Output tensor after processing.
|
673
|
+
"""
|
468
674
|
y = list(self.cv1(x).split((self.c, self.c), 1))
|
469
675
|
y.extend(m(y[-1]) for m in self.m)
|
470
676
|
y.append(self.attn(y[-1], guide))
|
@@ -475,7 +681,17 @@ class ImagePoolingAttn(nn.Module):
|
|
475
681
|
"""ImagePoolingAttn: Enhance the text embeddings with image-aware information."""
|
476
682
|
|
477
683
|
def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False):
|
478
|
-
"""
|
684
|
+
"""
|
685
|
+
Initialize ImagePoolingAttn module.
|
686
|
+
|
687
|
+
Args:
|
688
|
+
ec (int): Embedding channels.
|
689
|
+
ch (Tuple): Channel dimensions for feature maps.
|
690
|
+
ct (int): Channel dimension for text embeddings.
|
691
|
+
nh (int): Number of attention heads.
|
692
|
+
k (int): Kernel size for pooling.
|
693
|
+
scale (bool): Whether to use learnable scale parameter.
|
694
|
+
"""
|
479
695
|
super().__init__()
|
480
696
|
|
481
697
|
nf = len(ch)
|
@@ -493,7 +709,16 @@ class ImagePoolingAttn(nn.Module):
|
|
493
709
|
self.k = k
|
494
710
|
|
495
711
|
def forward(self, x, text):
|
496
|
-
"""
|
712
|
+
"""
|
713
|
+
Forward pass of ImagePoolingAttn.
|
714
|
+
|
715
|
+
Args:
|
716
|
+
x (List[torch.Tensor]): List of input feature maps.
|
717
|
+
text (torch.Tensor): Text embeddings.
|
718
|
+
|
719
|
+
Returns:
|
720
|
+
(torch.Tensor): Enhanced text embeddings.
|
721
|
+
"""
|
497
722
|
bs = x[0].shape[0]
|
498
723
|
assert len(x) == self.nf
|
499
724
|
num_patches = self.k**2
|
@@ -521,14 +746,23 @@ class ContrastiveHead(nn.Module):
|
|
521
746
|
"""Implements contrastive learning head for region-text similarity in vision-language models."""
|
522
747
|
|
523
748
|
def __init__(self):
|
524
|
-
"""
|
749
|
+
"""Initialize ContrastiveHead with region-text similarity parameters."""
|
525
750
|
super().__init__()
|
526
751
|
# NOTE: use -10.0 to keep the init cls loss consistency with other losses
|
527
752
|
self.bias = nn.Parameter(torch.tensor([-10.0]))
|
528
753
|
self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
|
529
754
|
|
530
755
|
def forward(self, x, w):
|
531
|
-
"""
|
756
|
+
"""
|
757
|
+
Forward function of contrastive learning.
|
758
|
+
|
759
|
+
Args:
|
760
|
+
x (torch.Tensor): Image features.
|
761
|
+
w (torch.Tensor): Text features.
|
762
|
+
|
763
|
+
Returns:
|
764
|
+
(torch.Tensor): Similarity scores.
|
765
|
+
"""
|
532
766
|
x = F.normalize(x, dim=1, p=2)
|
533
767
|
w = F.normalize(w, dim=-1, p=2)
|
534
768
|
x = torch.einsum("bchw,bkc->bkhw", x, w)
|
@@ -544,7 +778,12 @@ class BNContrastiveHead(nn.Module):
|
|
544
778
|
"""
|
545
779
|
|
546
780
|
def __init__(self, embed_dims: int):
|
547
|
-
"""
|
781
|
+
"""
|
782
|
+
Initialize BNContrastiveHead.
|
783
|
+
|
784
|
+
Args:
|
785
|
+
embed_dims (int): Embedding dimensions for features.
|
786
|
+
"""
|
548
787
|
super().__init__()
|
549
788
|
self.norm = nn.BatchNorm2d(embed_dims)
|
550
789
|
# NOTE: use -10.0 to keep the init cls loss consistency with other losses
|
@@ -553,7 +792,16 @@ class BNContrastiveHead(nn.Module):
|
|
553
792
|
self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))
|
554
793
|
|
555
794
|
def forward(self, x, w):
|
556
|
-
"""
|
795
|
+
"""
|
796
|
+
Forward function of contrastive learning with batch normalization.
|
797
|
+
|
798
|
+
Args:
|
799
|
+
x (torch.Tensor): Image features.
|
800
|
+
w (torch.Tensor): Text features.
|
801
|
+
|
802
|
+
Returns:
|
803
|
+
(torch.Tensor): Similarity scores.
|
804
|
+
"""
|
557
805
|
x = self.norm(x)
|
558
806
|
w = F.normalize(w, dim=-1, p=2)
|
559
807
|
x = torch.einsum("bchw,bkc->bkhw", x, w)
|
@@ -564,7 +812,17 @@ class RepBottleneck(Bottleneck):
|
|
564
812
|
"""Rep bottleneck."""
|
565
813
|
|
566
814
|
def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
|
567
|
-
"""
|
815
|
+
"""
|
816
|
+
Initialize RepBottleneck.
|
817
|
+
|
818
|
+
Args:
|
819
|
+
c1 (int): Input channels.
|
820
|
+
c2 (int): Output channels.
|
821
|
+
shortcut (bool): Whether to use shortcut connection.
|
822
|
+
g (int): Groups for convolutions.
|
823
|
+
k (Tuple[int, int]): Kernel sizes for convolutions.
|
824
|
+
e (float): Expansion ratio.
|
825
|
+
"""
|
568
826
|
super().__init__(c1, c2, shortcut, g, k, e)
|
569
827
|
c_ = int(c2 * e) # hidden channels
|
570
828
|
self.cv1 = RepConv(c1, c_, k[0], 1)
|
@@ -574,7 +832,17 @@ class RepCSP(C3):
|
|
574
832
|
"""Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction."""
|
575
833
|
|
576
834
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
577
|
-
"""
|
835
|
+
"""
|
836
|
+
Initialize RepCSP layer.
|
837
|
+
|
838
|
+
Args:
|
839
|
+
c1 (int): Input channels.
|
840
|
+
c2 (int): Output channels.
|
841
|
+
n (int): Number of RepBottleneck blocks.
|
842
|
+
shortcut (bool): Whether to use shortcut connections.
|
843
|
+
g (int): Groups for convolutions.
|
844
|
+
e (float): Expansion ratio.
|
845
|
+
"""
|
578
846
|
super().__init__(c1, c2, n, shortcut, g, e)
|
579
847
|
c_ = int(c2 * e) # hidden channels
|
580
848
|
self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
|
@@ -584,7 +852,16 @@ class RepNCSPELAN4(nn.Module):
|
|
584
852
|
"""CSP-ELAN."""
|
585
853
|
|
586
854
|
def __init__(self, c1, c2, c3, c4, n=1):
|
587
|
-
"""
|
855
|
+
"""
|
856
|
+
Initialize CSP-ELAN layer.
|
857
|
+
|
858
|
+
Args:
|
859
|
+
c1 (int): Input channels.
|
860
|
+
c2 (int): Output channels.
|
861
|
+
c3 (int): Intermediate channels.
|
862
|
+
c4 (int): Intermediate channels for RepCSP.
|
863
|
+
n (int): Number of RepCSP blocks.
|
864
|
+
"""
|
588
865
|
super().__init__()
|
589
866
|
self.c = c3 // 2
|
590
867
|
self.cv1 = Conv(c1, c3, 1, 1)
|
@@ -609,7 +886,15 @@ class ELAN1(RepNCSPELAN4):
|
|
609
886
|
"""ELAN1 module with 4 convolutions."""
|
610
887
|
|
611
888
|
def __init__(self, c1, c2, c3, c4):
|
612
|
-
"""
|
889
|
+
"""
|
890
|
+
Initialize ELAN1 layer.
|
891
|
+
|
892
|
+
Args:
|
893
|
+
c1 (int): Input channels.
|
894
|
+
c2 (int): Output channels.
|
895
|
+
c3 (int): Intermediate channels.
|
896
|
+
c4 (int): Intermediate channels for convolutions.
|
897
|
+
"""
|
613
898
|
super().__init__(c1, c2, c3, c4)
|
614
899
|
self.c = c3 // 2
|
615
900
|
self.cv1 = Conv(c1, c3, 1, 1)
|
@@ -622,7 +907,13 @@ class AConv(nn.Module):
|
|
622
907
|
"""AConv."""
|
623
908
|
|
624
909
|
def __init__(self, c1, c2):
|
625
|
-
"""
|
910
|
+
"""
|
911
|
+
Initialize AConv module.
|
912
|
+
|
913
|
+
Args:
|
914
|
+
c1 (int): Input channels.
|
915
|
+
c2 (int): Output channels.
|
916
|
+
"""
|
626
917
|
super().__init__()
|
627
918
|
self.cv1 = Conv(c1, c2, 3, 2, 1)
|
628
919
|
|
@@ -636,7 +927,13 @@ class ADown(nn.Module):
|
|
636
927
|
"""ADown."""
|
637
928
|
|
638
929
|
def __init__(self, c1, c2):
|
639
|
-
"""
|
930
|
+
"""
|
931
|
+
Initialize ADown module.
|
932
|
+
|
933
|
+
Args:
|
934
|
+
c1 (int): Input channels.
|
935
|
+
c2 (int): Output channels.
|
936
|
+
"""
|
640
937
|
super().__init__()
|
641
938
|
self.c = c2 // 2
|
642
939
|
self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
|
@@ -656,7 +953,15 @@ class SPPELAN(nn.Module):
|
|
656
953
|
"""SPP-ELAN."""
|
657
954
|
|
658
955
|
def __init__(self, c1, c2, c3, k=5):
|
659
|
-
"""
|
956
|
+
"""
|
957
|
+
Initialize SPP-ELAN block.
|
958
|
+
|
959
|
+
Args:
|
960
|
+
c1 (int): Input channels.
|
961
|
+
c2 (int): Output channels.
|
962
|
+
c3 (int): Intermediate channels.
|
963
|
+
k (int): Kernel size for max pooling.
|
964
|
+
"""
|
660
965
|
super().__init__()
|
661
966
|
self.c = c3
|
662
967
|
self.cv1 = Conv(c1, c3, 1, 1)
|
@@ -676,7 +981,17 @@ class CBLinear(nn.Module):
|
|
676
981
|
"""CBLinear."""
|
677
982
|
|
678
983
|
def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
|
679
|
-
"""
|
984
|
+
"""
|
985
|
+
Initialize CBLinear module.
|
986
|
+
|
987
|
+
Args:
|
988
|
+
c1 (int): Input channels.
|
989
|
+
c2s (List[int]): List of output channel sizes.
|
990
|
+
k (int): Kernel size.
|
991
|
+
s (int): Stride.
|
992
|
+
p (int | None): Padding.
|
993
|
+
g (int): Groups.
|
994
|
+
"""
|
680
995
|
super().__init__()
|
681
996
|
self.c2s = c2s
|
682
997
|
self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)
|
@@ -690,12 +1005,25 @@ class CBFuse(nn.Module):
|
|
690
1005
|
"""CBFuse."""
|
691
1006
|
|
692
1007
|
def __init__(self, idx):
|
693
|
-
"""
|
1008
|
+
"""
|
1009
|
+
Initialize CBFuse module.
|
1010
|
+
|
1011
|
+
Args:
|
1012
|
+
idx (List[int]): Indices for feature selection.
|
1013
|
+
"""
|
694
1014
|
super().__init__()
|
695
1015
|
self.idx = idx
|
696
1016
|
|
697
1017
|
def forward(self, xs):
|
698
|
-
"""
|
1018
|
+
"""
|
1019
|
+
Forward pass through CBFuse layer.
|
1020
|
+
|
1021
|
+
Args:
|
1022
|
+
xs (List[torch.Tensor]): List of input tensors.
|
1023
|
+
|
1024
|
+
Returns:
|
1025
|
+
(torch.Tensor): Fused output tensor.
|
1026
|
+
"""
|
699
1027
|
target_size = xs[-1].shape[2:]
|
700
1028
|
res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
|
701
1029
|
return torch.sum(torch.stack(res + xs[-1:]), dim=0)
|
@@ -705,8 +1033,16 @@ class C3f(nn.Module):
|
|
705
1033
|
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
|
706
1034
|
|
707
1035
|
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
|
708
|
-
"""
|
709
|
-
|
1036
|
+
"""
|
1037
|
+
Initialize CSP bottleneck layer with two convolutions.
|
1038
|
+
|
1039
|
+
Args:
|
1040
|
+
c1 (int): Input channels.
|
1041
|
+
c2 (int): Output channels.
|
1042
|
+
n (int): Number of Bottleneck blocks.
|
1043
|
+
shortcut (bool): Whether to use shortcut connections.
|
1044
|
+
g (int): Groups for convolutions.
|
1045
|
+
e (float): Expansion ratio.
|
710
1046
|
"""
|
711
1047
|
super().__init__()
|
712
1048
|
c_ = int(c2 * e) # hidden channels
|
@@ -716,7 +1052,7 @@ class C3f(nn.Module):
|
|
716
1052
|
self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
|
717
1053
|
|
718
1054
|
def forward(self, x):
|
719
|
-
"""Forward pass through
|
1055
|
+
"""Forward pass through C3f layer."""
|
720
1056
|
y = [self.cv2(x), self.cv1(x)]
|
721
1057
|
y.extend(m(y[-1]) for m in self.m)
|
722
1058
|
return self.cv3(torch.cat(y, 1))
|
@@ -726,7 +1062,18 @@ class C3k2(C2f):
|
|
726
1062
|
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
|
727
1063
|
|
728
1064
|
def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
|
729
|
-
"""
|
1065
|
+
"""
|
1066
|
+
Initialize C3k2 module.
|
1067
|
+
|
1068
|
+
Args:
|
1069
|
+
c1 (int): Input channels.
|
1070
|
+
c2 (int): Output channels.
|
1071
|
+
n (int): Number of blocks.
|
1072
|
+
c3k (bool): Whether to use C3k blocks.
|
1073
|
+
e (float): Expansion ratio.
|
1074
|
+
g (int): Groups for convolutions.
|
1075
|
+
shortcut (bool): Whether to use shortcut connections.
|
1076
|
+
"""
|
730
1077
|
super().__init__(c1, c2, n, shortcut, g, e)
|
731
1078
|
self.m = nn.ModuleList(
|
732
1079
|
C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
|
@@ -737,7 +1084,18 @@ class C3k(C3):
|
|
737
1084
|
"""C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
|
738
1085
|
|
739
1086
|
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
|
740
|
-
"""
|
1087
|
+
"""
|
1088
|
+
Initialize C3k module.
|
1089
|
+
|
1090
|
+
Args:
|
1091
|
+
c1 (int): Input channels.
|
1092
|
+
c2 (int): Output channels.
|
1093
|
+
n (int): Number of Bottleneck blocks.
|
1094
|
+
shortcut (bool): Whether to use shortcut connections.
|
1095
|
+
g (int): Groups for convolutions.
|
1096
|
+
e (float): Expansion ratio.
|
1097
|
+
k (int): Kernel size.
|
1098
|
+
"""
|
741
1099
|
super().__init__(c1, c2, n, shortcut, g, e)
|
742
1100
|
c_ = int(c2 * e) # hidden channels
|
743
1101
|
# self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
|
@@ -748,7 +1106,12 @@ class RepVGGDW(torch.nn.Module):
|
|
748
1106
|
"""RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
|
749
1107
|
|
750
1108
|
def __init__(self, ed) -> None:
|
751
|
-
"""
|
1109
|
+
"""
|
1110
|
+
Initialize RepVGGDW module.
|
1111
|
+
|
1112
|
+
Args:
|
1113
|
+
ed (int): Input and output channels.
|
1114
|
+
"""
|
752
1115
|
super().__init__()
|
753
1116
|
self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
|
754
1117
|
self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
|
@@ -757,7 +1120,7 @@ class RepVGGDW(torch.nn.Module):
|
|
757
1120
|
|
758
1121
|
def forward(self, x):
|
759
1122
|
"""
|
760
|
-
|
1123
|
+
Perform a forward pass of the RepVGGDW block.
|
761
1124
|
|
762
1125
|
Args:
|
763
1126
|
x (torch.Tensor): Input tensor.
|
@@ -769,7 +1132,7 @@ class RepVGGDW(torch.nn.Module):
|
|
769
1132
|
|
770
1133
|
def forward_fuse(self, x):
|
771
1134
|
"""
|
772
|
-
|
1135
|
+
Perform a forward pass of the RepVGGDW block without fusing the convolutions.
|
773
1136
|
|
774
1137
|
Args:
|
775
1138
|
x (torch.Tensor): Input tensor.
|
@@ -782,7 +1145,7 @@ class RepVGGDW(torch.nn.Module):
|
|
782
1145
|
@torch.no_grad()
|
783
1146
|
def fuse(self):
|
784
1147
|
"""
|
785
|
-
|
1148
|
+
Fuse the convolutional layers in the RepVGGDW block.
|
786
1149
|
|
787
1150
|
This method fuses the convolutional layers and updates the weights and biases accordingly.
|
788
1151
|
"""
|
@@ -819,7 +1182,16 @@ class CIB(nn.Module):
|
|
819
1182
|
"""
|
820
1183
|
|
821
1184
|
def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
|
822
|
-
"""
|
1185
|
+
"""
|
1186
|
+
Initialize the CIB module.
|
1187
|
+
|
1188
|
+
Args:
|
1189
|
+
c1 (int): Input channels.
|
1190
|
+
c2 (int): Output channels.
|
1191
|
+
shortcut (bool): Whether to use shortcut connection.
|
1192
|
+
e (float): Expansion ratio.
|
1193
|
+
lk (bool): Whether to use RepVGGDW.
|
1194
|
+
"""
|
823
1195
|
super().__init__()
|
824
1196
|
c_ = int(c2 * e) # hidden channels
|
825
1197
|
self.cv1 = nn.Sequential(
|
@@ -860,7 +1232,18 @@ class C2fCIB(C2f):
|
|
860
1232
|
"""
|
861
1233
|
|
862
1234
|
def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
|
863
|
-
"""
|
1235
|
+
"""
|
1236
|
+
Initialize C2fCIB module.
|
1237
|
+
|
1238
|
+
Args:
|
1239
|
+
c1 (int): Input channels.
|
1240
|
+
c2 (int): Output channels.
|
1241
|
+
n (int): Number of CIB modules.
|
1242
|
+
shortcut (bool): Whether to use shortcut connection.
|
1243
|
+
lk (bool): Whether to use RepVGGDW in the CIB modules.
|
1244
|
+
g (int): Groups for convolutions.
|
1245
|
+
e (float): Expansion ratio.
|
1246
|
+
"""
|
864
1247
|
super().__init__(c1, c2, n, shortcut, g, e)
|
865
1248
|
self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
|
866
1249
|
|
@@ -885,7 +1268,14 @@ class Attention(nn.Module):
|
|
885
1268
|
"""
|
886
1269
|
|
887
1270
|
def __init__(self, dim, num_heads=8, attn_ratio=0.5):
|
888
|
-
"""
|
1271
|
+
"""
|
1272
|
+
Initialize multi-head attention module.
|
1273
|
+
|
1274
|
+
Args:
|
1275
|
+
dim (int): Input dimension.
|
1276
|
+
num_heads (int): Number of attention heads.
|
1277
|
+
attn_ratio (float): Attention ratio for key dimension.
|
1278
|
+
"""
|
889
1279
|
super().__init__()
|
890
1280
|
self.num_heads = num_heads
|
891
1281
|
self.head_dim = dim // num_heads
|
@@ -944,7 +1334,15 @@ class PSABlock(nn.Module):
|
|
944
1334
|
"""
|
945
1335
|
|
946
1336
|
def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
|
947
|
-
"""
|
1337
|
+
"""
|
1338
|
+
Initialize the PSABlock.
|
1339
|
+
|
1340
|
+
Args:
|
1341
|
+
c (int): Input and output channels.
|
1342
|
+
attn_ratio (float): Attention ratio for key dimension.
|
1343
|
+
num_heads (int): Number of attention heads.
|
1344
|
+
shortcut (bool): Whether to use shortcut connections.
|
1345
|
+
"""
|
948
1346
|
super().__init__()
|
949
1347
|
|
950
1348
|
self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
|
@@ -952,7 +1350,15 @@ class PSABlock(nn.Module):
|
|
952
1350
|
self.add = shortcut
|
953
1351
|
|
954
1352
|
def forward(self, x):
|
955
|
-
"""
|
1353
|
+
"""
|
1354
|
+
Execute a forward pass through PSABlock.
|
1355
|
+
|
1356
|
+
Args:
|
1357
|
+
x (torch.Tensor): Input tensor.
|
1358
|
+
|
1359
|
+
Returns:
|
1360
|
+
(torch.Tensor): Output tensor after attention and feed-forward processing.
|
1361
|
+
"""
|
956
1362
|
x = x + self.attn(x) if self.add else self.attn(x)
|
957
1363
|
x = x + self.ffn(x) if self.add else self.ffn(x)
|
958
1364
|
return x
|
@@ -983,7 +1389,14 @@ class PSA(nn.Module):
|
|
983
1389
|
"""
|
984
1390
|
|
985
1391
|
def __init__(self, c1, c2, e=0.5):
|
986
|
-
"""
|
1392
|
+
"""
|
1393
|
+
Initialize PSA module.
|
1394
|
+
|
1395
|
+
Args:
|
1396
|
+
c1 (int): Input channels.
|
1397
|
+
c2 (int): Output channels.
|
1398
|
+
e (float): Expansion ratio.
|
1399
|
+
"""
|
987
1400
|
super().__init__()
|
988
1401
|
assert c1 == c2
|
989
1402
|
self.c = int(c1 * e)
|
@@ -994,7 +1407,15 @@ class PSA(nn.Module):
|
|
994
1407
|
self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
|
995
1408
|
|
996
1409
|
def forward(self, x):
|
997
|
-
"""
|
1410
|
+
"""
|
1411
|
+
Execute forward pass in PSA module.
|
1412
|
+
|
1413
|
+
Args:
|
1414
|
+
x (torch.Tensor): Input tensor.
|
1415
|
+
|
1416
|
+
Returns:
|
1417
|
+
(torch.Tensor): Output tensor after attention and feed-forward processing.
|
1418
|
+
"""
|
998
1419
|
a, b = self.cv1(x).split((self.c, self.c), dim=1)
|
999
1420
|
b = b + self.attn(b)
|
1000
1421
|
b = b + self.ffn(b)
|
@@ -1027,7 +1448,15 @@ class C2PSA(nn.Module):
|
|
1027
1448
|
"""
|
1028
1449
|
|
1029
1450
|
def __init__(self, c1, c2, n=1, e=0.5):
|
1030
|
-
"""
|
1451
|
+
"""
|
1452
|
+
Initialize C2PSA module.
|
1453
|
+
|
1454
|
+
Args:
|
1455
|
+
c1 (int): Input channels.
|
1456
|
+
c2 (int): Output channels.
|
1457
|
+
n (int): Number of PSABlock modules.
|
1458
|
+
e (float): Expansion ratio.
|
1459
|
+
"""
|
1031
1460
|
super().__init__()
|
1032
1461
|
assert c1 == c2
|
1033
1462
|
self.c = int(c1 * e)
|
@@ -1037,7 +1466,15 @@ class C2PSA(nn.Module):
|
|
1037
1466
|
self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
|
1038
1467
|
|
1039
1468
|
def forward(self, x):
|
1040
|
-
"""
|
1469
|
+
"""
|
1470
|
+
Process the input tensor through a series of PSA blocks.
|
1471
|
+
|
1472
|
+
Args:
|
1473
|
+
x (torch.Tensor): Input tensor.
|
1474
|
+
|
1475
|
+
Returns:
|
1476
|
+
(torch.Tensor): Output tensor after processing.
|
1477
|
+
"""
|
1041
1478
|
a, b = self.cv1(x).split((self.c, self.c), dim=1)
|
1042
1479
|
b = self.m(b)
|
1043
1480
|
return self.cv2(torch.cat((a, b), 1))
|
@@ -1069,7 +1506,15 @@ class C2fPSA(C2f):
|
|
1069
1506
|
"""
|
1070
1507
|
|
1071
1508
|
def __init__(self, c1, c2, n=1, e=0.5):
|
1072
|
-
"""
|
1509
|
+
"""
|
1510
|
+
Initialize C2fPSA module.
|
1511
|
+
|
1512
|
+
Args:
|
1513
|
+
c1 (int): Input channels.
|
1514
|
+
c2 (int): Output channels.
|
1515
|
+
n (int): Number of PSABlock modules.
|
1516
|
+
e (float): Expansion ratio.
|
1517
|
+
"""
|
1073
1518
|
assert c1 == c2
|
1074
1519
|
super().__init__(c1, c2, n=n, e=e)
|
1075
1520
|
self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
|
@@ -1100,13 +1545,29 @@ class SCDown(nn.Module):
|
|
1100
1545
|
"""
|
1101
1546
|
|
1102
1547
|
def __init__(self, c1, c2, k, s):
|
1103
|
-
"""
|
1548
|
+
"""
|
1549
|
+
Initialize SCDown module.
|
1550
|
+
|
1551
|
+
Args:
|
1552
|
+
c1 (int): Input channels.
|
1553
|
+
c2 (int): Output channels.
|
1554
|
+
k (int): Kernel size.
|
1555
|
+
s (int): Stride.
|
1556
|
+
"""
|
1104
1557
|
super().__init__()
|
1105
1558
|
self.cv1 = Conv(c1, c2, 1, 1)
|
1106
1559
|
self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
|
1107
1560
|
|
1108
1561
|
def forward(self, x):
|
1109
|
-
"""
|
1562
|
+
"""
|
1563
|
+
Apply convolution and downsampling to the input tensor.
|
1564
|
+
|
1565
|
+
Args:
|
1566
|
+
x (torch.Tensor): Input tensor.
|
1567
|
+
|
1568
|
+
Returns:
|
1569
|
+
(torch.Tensor): Downsampled output tensor.
|
1570
|
+
"""
|
1110
1571
|
return self.cv2(self.cv1(x))
|
1111
1572
|
|
1112
1573
|
|
@@ -1128,7 +1589,16 @@ class TorchVision(nn.Module):
|
|
1128
1589
|
"""
|
1129
1590
|
|
1130
1591
|
def __init__(self, model, weights="DEFAULT", unwrap=True, truncate=2, split=False):
|
1131
|
-
"""
|
1592
|
+
"""
|
1593
|
+
Load the model and weights from torchvision.
|
1594
|
+
|
1595
|
+
Args:
|
1596
|
+
model (str): Name of the torchvision model to load.
|
1597
|
+
weights (str): Pre-trained weights to load.
|
1598
|
+
unwrap (bool): Whether to unwrap the model.
|
1599
|
+
truncate (int): Number of layers to truncate.
|
1600
|
+
split (bool): Whether to split the output.
|
1601
|
+
"""
|
1132
1602
|
import torchvision # scope for faster 'import ultralytics'
|
1133
1603
|
|
1134
1604
|
super().__init__()
|
@@ -1147,7 +1617,15 @@ class TorchVision(nn.Module):
|
|
1147
1617
|
self.m.head = self.m.heads = nn.Identity()
|
1148
1618
|
|
1149
1619
|
def forward(self, x):
|
1150
|
-
"""
|
1620
|
+
"""
|
1621
|
+
Forward pass through the model.
|
1622
|
+
|
1623
|
+
Args:
|
1624
|
+
x (torch.Tensor): Input tensor.
|
1625
|
+
|
1626
|
+
Returns:
|
1627
|
+
(torch.Tensor | List[torch.Tensor]): Output tensor or list of tensors.
|
1628
|
+
"""
|
1151
1629
|
if self.split:
|
1152
1630
|
y = [x]
|
1153
1631
|
y.extend(m(y[-1]) for m in self.m)
|
@@ -1184,7 +1662,7 @@ class AAttn(nn.Module):
|
|
1184
1662
|
|
1185
1663
|
def __init__(self, dim, num_heads, area=1):
|
1186
1664
|
"""
|
1187
|
-
|
1665
|
+
Initialize an Area-attention module for YOLO models.
|
1188
1666
|
|
1189
1667
|
Args:
|
1190
1668
|
dim (int): Number of hidden channels.
|
@@ -1203,7 +1681,15 @@ class AAttn(nn.Module):
|
|
1203
1681
|
self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)
|
1204
1682
|
|
1205
1683
|
def forward(self, x):
|
1206
|
-
"""
|
1684
|
+
"""
|
1685
|
+
Process the input tensor through the area-attention.
|
1686
|
+
|
1687
|
+
Args:
|
1688
|
+
x (torch.Tensor): Input tensor.
|
1689
|
+
|
1690
|
+
Returns:
|
1691
|
+
(torch.Tensor): Output tensor after area-attention.
|
1692
|
+
"""
|
1207
1693
|
B, C, H, W = x.shape
|
1208
1694
|
N = H * W
|
1209
1695
|
|
@@ -1260,11 +1746,7 @@ class ABlock(nn.Module):
|
|
1260
1746
|
|
1261
1747
|
def __init__(self, dim, num_heads, mlp_ratio=1.2, area=1):
|
1262
1748
|
"""
|
1263
|
-
|
1264
|
-
|
1265
|
-
This module implements an area-attention mechanism combined with a feed-forward network for processing feature
|
1266
|
-
maps. It uses a novel area-based attention approach that is more efficient than traditional self-attention
|
1267
|
-
while maintaining effectiveness.
|
1749
|
+
Initialize an Area-attention block module.
|
1268
1750
|
|
1269
1751
|
Args:
|
1270
1752
|
dim (int): Number of input channels.
|
@@ -1281,14 +1763,27 @@ class ABlock(nn.Module):
|
|
1281
1763
|
self.apply(self._init_weights)
|
1282
1764
|
|
1283
1765
|
def _init_weights(self, m):
|
1284
|
-
"""
|
1766
|
+
"""
|
1767
|
+
Initialize weights using a truncated normal distribution.
|
1768
|
+
|
1769
|
+
Args:
|
1770
|
+
m (nn.Module): Module to initialize.
|
1771
|
+
"""
|
1285
1772
|
if isinstance(m, nn.Conv2d):
|
1286
1773
|
nn.init.trunc_normal_(m.weight, std=0.02)
|
1287
1774
|
if m.bias is not None:
|
1288
1775
|
nn.init.constant_(m.bias, 0)
|
1289
1776
|
|
1290
1777
|
def forward(self, x):
|
1291
|
-
"""
|
1778
|
+
"""
|
1779
|
+
Forward pass through ABlock.
|
1780
|
+
|
1781
|
+
Args:
|
1782
|
+
x (torch.Tensor): Input tensor.
|
1783
|
+
|
1784
|
+
Returns:
|
1785
|
+
(torch.Tensor): Output tensor after area-attention and feed-forward processing.
|
1786
|
+
"""
|
1292
1787
|
x = x + self.attn(x)
|
1293
1788
|
return x + self.mlp(x)
|
1294
1789
|
|
@@ -1319,7 +1814,7 @@ class A2C2f(nn.Module):
|
|
1319
1814
|
|
1320
1815
|
def __init__(self, c1, c2, n=1, a2=True, area=1, residual=False, mlp_ratio=2.0, e=0.5, g=1, shortcut=True):
|
1321
1816
|
"""
|
1322
|
-
Area-Attention C2f module
|
1817
|
+
Initialize Area-Attention C2f module.
|
1323
1818
|
|
1324
1819
|
Args:
|
1325
1820
|
c1 (int): Number of input channels.
|
@@ -1349,7 +1844,15 @@ class A2C2f(nn.Module):
|
|
1349
1844
|
)
|
1350
1845
|
|
1351
1846
|
def forward(self, x):
|
1352
|
-
"""
|
1847
|
+
"""
|
1848
|
+
Forward pass through A2C2f layer.
|
1849
|
+
|
1850
|
+
Args:
|
1851
|
+
x (torch.Tensor): Input tensor.
|
1852
|
+
|
1853
|
+
Returns:
|
1854
|
+
(torch.Tensor): Output tensor after processing.
|
1855
|
+
"""
|
1353
1856
|
y = [self.cv1(x)]
|
1354
1857
|
y.extend(m(y[-1]) for m in self.m)
|
1355
1858
|
y = self.cv2(torch.cat(y, 1))
|