dgenerate-ultralytics-headless 8.3.137__py3-none-any.whl → 8.3.224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/METADATA +41 -34
- dgenerate_ultralytics_headless-8.3.224.dist-info/RECORD +285 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/WHEEL +1 -1
- tests/__init__.py +7 -6
- tests/conftest.py +15 -39
- tests/test_cli.py +17 -17
- tests/test_cuda.py +17 -8
- tests/test_engine.py +36 -10
- tests/test_exports.py +98 -37
- tests/test_integrations.py +12 -15
- tests/test_python.py +126 -82
- tests/test_solutions.py +319 -135
- ultralytics/__init__.py +27 -9
- ultralytics/cfg/__init__.py +83 -87
- ultralytics/cfg/datasets/Argoverse.yaml +4 -4
- ultralytics/cfg/datasets/DOTAv1.5.yaml +2 -2
- ultralytics/cfg/datasets/DOTAv1.yaml +2 -2
- ultralytics/cfg/datasets/GlobalWheat2020.yaml +2 -2
- ultralytics/cfg/datasets/HomeObjects-3K.yaml +4 -5
- ultralytics/cfg/datasets/ImageNet.yaml +3 -3
- ultralytics/cfg/datasets/Objects365.yaml +24 -20
- ultralytics/cfg/datasets/SKU-110K.yaml +9 -9
- ultralytics/cfg/datasets/VOC.yaml +10 -13
- ultralytics/cfg/datasets/VisDrone.yaml +43 -33
- ultralytics/cfg/datasets/african-wildlife.yaml +5 -5
- ultralytics/cfg/datasets/brain-tumor.yaml +4 -5
- ultralytics/cfg/datasets/carparts-seg.yaml +5 -5
- ultralytics/cfg/datasets/coco-pose.yaml +26 -4
- ultralytics/cfg/datasets/coco.yaml +4 -4
- ultralytics/cfg/datasets/coco128-seg.yaml +2 -2
- ultralytics/cfg/datasets/coco128.yaml +2 -2
- ultralytics/cfg/datasets/coco8-grayscale.yaml +103 -0
- ultralytics/cfg/datasets/coco8-multispectral.yaml +2 -2
- ultralytics/cfg/datasets/coco8-pose.yaml +23 -2
- ultralytics/cfg/datasets/coco8-seg.yaml +2 -2
- ultralytics/cfg/datasets/coco8.yaml +2 -2
- ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
- ultralytics/cfg/datasets/crack-seg.yaml +5 -5
- ultralytics/cfg/datasets/dog-pose.yaml +32 -4
- ultralytics/cfg/datasets/dota8-multispectral.yaml +2 -2
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +29 -4
- ultralytics/cfg/datasets/lvis.yaml +9 -9
- ultralytics/cfg/datasets/medical-pills.yaml +4 -5
- ultralytics/cfg/datasets/open-images-v7.yaml +7 -10
- ultralytics/cfg/datasets/package-seg.yaml +5 -5
- ultralytics/cfg/datasets/signature.yaml +4 -4
- ultralytics/cfg/datasets/tiger-pose.yaml +20 -4
- ultralytics/cfg/datasets/xView.yaml +5 -5
- ultralytics/cfg/default.yaml +96 -93
- ultralytics/cfg/trackers/botsort.yaml +16 -17
- ultralytics/cfg/trackers/bytetrack.yaml +9 -11
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +12 -12
- ultralytics/data/augment.py +531 -564
- ultralytics/data/base.py +76 -81
- ultralytics/data/build.py +206 -42
- ultralytics/data/converter.py +179 -78
- ultralytics/data/dataset.py +121 -121
- ultralytics/data/loaders.py +114 -91
- ultralytics/data/split.py +28 -15
- ultralytics/data/split_dota.py +67 -48
- ultralytics/data/utils.py +110 -89
- ultralytics/engine/exporter.py +422 -460
- ultralytics/engine/model.py +224 -252
- ultralytics/engine/predictor.py +94 -89
- ultralytics/engine/results.py +345 -595
- ultralytics/engine/trainer.py +231 -134
- ultralytics/engine/tuner.py +279 -73
- ultralytics/engine/validator.py +53 -46
- ultralytics/hub/__init__.py +26 -28
- ultralytics/hub/auth.py +30 -16
- ultralytics/hub/google/__init__.py +34 -36
- ultralytics/hub/session.py +53 -77
- ultralytics/hub/utils.py +23 -109
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +36 -18
- ultralytics/models/fastsam/predict.py +33 -44
- ultralytics/models/fastsam/utils.py +4 -5
- ultralytics/models/fastsam/val.py +12 -14
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +16 -20
- ultralytics/models/nas/predict.py +12 -14
- ultralytics/models/nas/val.py +4 -5
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +9 -9
- ultralytics/models/rtdetr/predict.py +22 -17
- ultralytics/models/rtdetr/train.py +20 -16
- ultralytics/models/rtdetr/val.py +79 -59
- ultralytics/models/sam/__init__.py +8 -2
- ultralytics/models/sam/amg.py +53 -38
- ultralytics/models/sam/build.py +29 -31
- ultralytics/models/sam/model.py +33 -38
- ultralytics/models/sam/modules/blocks.py +159 -182
- ultralytics/models/sam/modules/decoders.py +38 -47
- ultralytics/models/sam/modules/encoders.py +114 -133
- ultralytics/models/sam/modules/memory_attention.py +38 -31
- ultralytics/models/sam/modules/sam.py +114 -93
- ultralytics/models/sam/modules/tiny_encoder.py +268 -291
- ultralytics/models/sam/modules/transformer.py +59 -66
- ultralytics/models/sam/modules/utils.py +55 -72
- ultralytics/models/sam/predict.py +745 -341
- ultralytics/models/utils/loss.py +118 -107
- ultralytics/models/utils/ops.py +118 -71
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +28 -26
- ultralytics/models/yolo/classify/train.py +50 -81
- ultralytics/models/yolo/classify/val.py +68 -61
- ultralytics/models/yolo/detect/predict.py +12 -15
- ultralytics/models/yolo/detect/train.py +56 -46
- ultralytics/models/yolo/detect/val.py +279 -223
- ultralytics/models/yolo/model.py +167 -86
- ultralytics/models/yolo/obb/predict.py +7 -11
- ultralytics/models/yolo/obb/train.py +23 -25
- ultralytics/models/yolo/obb/val.py +107 -99
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +12 -14
- ultralytics/models/yolo/pose/train.py +31 -69
- ultralytics/models/yolo/pose/val.py +119 -254
- ultralytics/models/yolo/segment/predict.py +21 -25
- ultralytics/models/yolo/segment/train.py +12 -66
- ultralytics/models/yolo/segment/val.py +126 -305
- ultralytics/models/yolo/world/train.py +53 -45
- ultralytics/models/yolo/world/train_world.py +51 -32
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +30 -37
- ultralytics/models/yolo/yoloe/train.py +89 -71
- ultralytics/models/yolo/yoloe/train_seg.py +15 -17
- ultralytics/models/yolo/yoloe/val.py +56 -41
- ultralytics/nn/__init__.py +9 -11
- ultralytics/nn/autobackend.py +179 -107
- ultralytics/nn/modules/__init__.py +67 -67
- ultralytics/nn/modules/activation.py +8 -7
- ultralytics/nn/modules/block.py +302 -323
- ultralytics/nn/modules/conv.py +61 -104
- ultralytics/nn/modules/head.py +488 -186
- ultralytics/nn/modules/transformer.py +183 -123
- ultralytics/nn/modules/utils.py +15 -20
- ultralytics/nn/tasks.py +327 -203
- ultralytics/nn/text_model.py +81 -65
- ultralytics/py.typed +1 -0
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +19 -27
- ultralytics/solutions/analytics.py +36 -26
- ultralytics/solutions/config.py +29 -28
- ultralytics/solutions/distance_calculation.py +23 -24
- ultralytics/solutions/heatmap.py +17 -19
- ultralytics/solutions/instance_segmentation.py +21 -19
- ultralytics/solutions/object_blurrer.py +16 -17
- ultralytics/solutions/object_counter.py +48 -53
- ultralytics/solutions/object_cropper.py +22 -16
- ultralytics/solutions/parking_management.py +61 -58
- ultralytics/solutions/queue_management.py +19 -19
- ultralytics/solutions/region_counter.py +63 -50
- ultralytics/solutions/security_alarm.py +22 -25
- ultralytics/solutions/similarity_search.py +107 -60
- ultralytics/solutions/solutions.py +343 -262
- ultralytics/solutions/speed_estimation.py +35 -31
- ultralytics/solutions/streamlit_inference.py +104 -40
- ultralytics/solutions/templates/similarity-search.html +31 -24
- ultralytics/solutions/trackzone.py +24 -24
- ultralytics/solutions/vision_eye.py +11 -12
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +18 -27
- ultralytics/trackers/bot_sort.py +48 -39
- ultralytics/trackers/byte_tracker.py +94 -94
- ultralytics/trackers/track.py +7 -16
- ultralytics/trackers/utils/gmc.py +37 -69
- ultralytics/trackers/utils/kalman_filter.py +68 -76
- ultralytics/trackers/utils/matching.py +13 -17
- ultralytics/utils/__init__.py +251 -275
- ultralytics/utils/autobatch.py +19 -7
- ultralytics/utils/autodevice.py +68 -38
- ultralytics/utils/benchmarks.py +169 -130
- ultralytics/utils/callbacks/base.py +12 -13
- ultralytics/utils/callbacks/clearml.py +14 -15
- ultralytics/utils/callbacks/comet.py +139 -66
- ultralytics/utils/callbacks/dvc.py +19 -27
- ultralytics/utils/callbacks/hub.py +8 -6
- ultralytics/utils/callbacks/mlflow.py +6 -10
- ultralytics/utils/callbacks/neptune.py +11 -19
- ultralytics/utils/callbacks/platform.py +73 -0
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +9 -12
- ultralytics/utils/callbacks/wb.py +33 -30
- ultralytics/utils/checks.py +163 -114
- ultralytics/utils/cpu.py +89 -0
- ultralytics/utils/dist.py +24 -20
- ultralytics/utils/downloads.py +176 -146
- ultralytics/utils/errors.py +11 -13
- ultralytics/utils/events.py +113 -0
- ultralytics/utils/export/__init__.py +7 -0
- ultralytics/utils/{export.py → export/engine.py} +81 -63
- ultralytics/utils/export/imx.py +294 -0
- ultralytics/utils/export/tensorflow.py +217 -0
- ultralytics/utils/files.py +33 -36
- ultralytics/utils/git.py +137 -0
- ultralytics/utils/instance.py +105 -120
- ultralytics/utils/logger.py +404 -0
- ultralytics/utils/loss.py +99 -61
- ultralytics/utils/metrics.py +649 -478
- ultralytics/utils/nms.py +337 -0
- ultralytics/utils/ops.py +263 -451
- ultralytics/utils/patches.py +70 -31
- ultralytics/utils/plotting.py +253 -223
- ultralytics/utils/tal.py +48 -61
- ultralytics/utils/torch_utils.py +244 -251
- ultralytics/utils/tqdm.py +438 -0
- ultralytics/utils/triton.py +22 -23
- ultralytics/utils/tuner.py +11 -10
- dgenerate_ultralytics_headless-8.3.137.dist-info/RECORD +0 -272
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/top_level.txt +0 -0
ultralytics/nn/modules/block.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
|
2
2
|
"""Block modules."""
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import torch
|
|
5
7
|
import torch.nn as nn
|
|
6
8
|
import torch.nn.functional as F
|
|
@@ -11,64 +13,67 @@ from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad
|
|
|
11
13
|
from .transformer import TransformerBlock
|
|
12
14
|
|
|
13
15
|
__all__ = (
|
|
14
|
-
"DFL",
|
|
15
|
-
"HGBlock",
|
|
16
|
-
"HGStem",
|
|
17
|
-
"SPP",
|
|
18
|
-
"SPPF",
|
|
19
16
|
"C1",
|
|
20
17
|
"C2",
|
|
18
|
+
"C2PSA",
|
|
21
19
|
"C3",
|
|
20
|
+
"C3TR",
|
|
21
|
+
"CIB",
|
|
22
|
+
"DFL",
|
|
23
|
+
"ELAN1",
|
|
24
|
+
"PSA",
|
|
25
|
+
"SPP",
|
|
26
|
+
"SPPELAN",
|
|
27
|
+
"SPPF",
|
|
28
|
+
"AConv",
|
|
29
|
+
"ADown",
|
|
30
|
+
"Attention",
|
|
31
|
+
"BNContrastiveHead",
|
|
32
|
+
"Bottleneck",
|
|
33
|
+
"BottleneckCSP",
|
|
22
34
|
"C2f",
|
|
23
35
|
"C2fAttn",
|
|
24
|
-
"ImagePoolingAttn",
|
|
25
|
-
"ContrastiveHead",
|
|
26
|
-
"BNContrastiveHead",
|
|
27
|
-
"C3x",
|
|
28
|
-
"C3TR",
|
|
36
|
+
"C2fCIB",
|
|
37
|
+
"C2fPSA",
|
|
29
38
|
"C3Ghost",
|
|
39
|
+
"C3k2",
|
|
40
|
+
"C3x",
|
|
41
|
+
"CBFuse",
|
|
42
|
+
"CBLinear",
|
|
43
|
+
"ContrastiveHead",
|
|
30
44
|
"GhostBottleneck",
|
|
31
|
-
"Bottleneck",
|
|
32
|
-
"BottleneckCSP",
|
|
45
|
+
"HGBlock",
|
|
46
|
+
"HGStem",
|
|
47
|
+
"ImagePoolingAttn",
|
|
33
48
|
"Proto",
|
|
34
49
|
"RepC3",
|
|
35
|
-
"ResNetLayer",
|
|
36
50
|
"RepNCSPELAN4",
|
|
37
|
-
"ELAN1",
|
|
38
|
-
"ADown",
|
|
39
|
-
"AConv",
|
|
40
|
-
"SPPELAN",
|
|
41
|
-
"CBFuse",
|
|
42
|
-
"CBLinear",
|
|
43
|
-
"C3k2",
|
|
44
|
-
"C2fPSA",
|
|
45
|
-
"C2PSA",
|
|
46
51
|
"RepVGGDW",
|
|
47
|
-
"CIB",
|
|
48
|
-
"C2fCIB",
|
|
49
|
-
"Attention",
|
|
50
|
-
"PSA",
|
|
52
|
+
"ResNetLayer",
|
|
51
53
|
"SCDown",
|
|
52
54
|
"TorchVision",
|
|
53
55
|
)
|
|
54
56
|
|
|
55
57
|
|
|
56
58
|
class DFL(nn.Module):
|
|
57
|
-
"""
|
|
58
|
-
Integral module of Distribution Focal Loss (DFL).
|
|
59
|
+
"""Integral module of Distribution Focal Loss (DFL).
|
|
59
60
|
|
|
60
61
|
Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
|
|
61
62
|
"""
|
|
62
63
|
|
|
63
|
-
def __init__(self, c1=16):
|
|
64
|
-
"""Initialize a convolutional layer with a given number of input channels.
|
|
64
|
+
def __init__(self, c1: int = 16):
|
|
65
|
+
"""Initialize a convolutional layer with a given number of input channels.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
c1 (int): Number of input channels.
|
|
69
|
+
"""
|
|
65
70
|
super().__init__()
|
|
66
71
|
self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
|
|
67
72
|
x = torch.arange(c1, dtype=torch.float)
|
|
68
73
|
self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
|
|
69
74
|
self.c1 = c1
|
|
70
75
|
|
|
71
|
-
def forward(self, x):
|
|
76
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
72
77
|
"""Apply the DFL module to input tensor and return transformed output."""
|
|
73
78
|
b, _, a = x.shape # batch, channels, anchors
|
|
74
79
|
return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
|
|
@@ -78,9 +83,8 @@ class DFL(nn.Module):
|
|
|
78
83
|
class Proto(nn.Module):
|
|
79
84
|
"""Ultralytics YOLO models mask Proto module for segmentation models."""
|
|
80
85
|
|
|
81
|
-
def __init__(self, c1, c_=256, c2=32):
|
|
82
|
-
"""
|
|
83
|
-
Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.
|
|
86
|
+
def __init__(self, c1: int, c_: int = 256, c2: int = 32):
|
|
87
|
+
"""Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.
|
|
84
88
|
|
|
85
89
|
Args:
|
|
86
90
|
c1 (int): Input channels.
|
|
@@ -93,21 +97,19 @@ class Proto(nn.Module):
|
|
|
93
97
|
self.cv2 = Conv(c_, c_, k=3)
|
|
94
98
|
self.cv3 = Conv(c_, c2)
|
|
95
99
|
|
|
96
|
-
def forward(self, x):
|
|
100
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
97
101
|
"""Perform a forward pass through layers using an upsampled input image."""
|
|
98
102
|
return self.cv3(self.cv2(self.upsample(self.cv1(x))))
|
|
99
103
|
|
|
100
104
|
|
|
101
105
|
class HGStem(nn.Module):
|
|
102
|
-
"""
|
|
103
|
-
StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
|
|
106
|
+
"""StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.
|
|
104
107
|
|
|
105
108
|
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
|
|
106
109
|
"""
|
|
107
110
|
|
|
108
|
-
def __init__(self, c1, cm, c2):
|
|
109
|
-
"""
|
|
110
|
-
Initialize the StemBlock of PPHGNetV2.
|
|
111
|
+
def __init__(self, c1: int, cm: int, c2: int):
|
|
112
|
+
"""Initialize the StemBlock of PPHGNetV2.
|
|
111
113
|
|
|
112
114
|
Args:
|
|
113
115
|
c1 (int): Input channels.
|
|
@@ -122,7 +124,7 @@ class HGStem(nn.Module):
|
|
|
122
124
|
self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU())
|
|
123
125
|
self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True)
|
|
124
126
|
|
|
125
|
-
def forward(self, x):
|
|
127
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
126
128
|
"""Forward pass of a PPHGNetV2 backbone layer."""
|
|
127
129
|
x = self.stem1(x)
|
|
128
130
|
x = F.pad(x, [0, 1, 0, 1])
|
|
@@ -137,15 +139,23 @@ class HGStem(nn.Module):
|
|
|
137
139
|
|
|
138
140
|
|
|
139
141
|
class HGBlock(nn.Module):
|
|
140
|
-
"""
|
|
141
|
-
HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
|
|
142
|
+
"""HG_Block of PPHGNetV2 with 2 convolutions and LightConv.
|
|
142
143
|
|
|
143
144
|
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
|
|
144
145
|
"""
|
|
145
146
|
|
|
146
|
-
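def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
|
|
147
|
-
"""
|
|
148
|
-
Initialize HGBlock with specified parameters.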
|
|
147
|
+
def __init__(
|
|
148
|
+
self,
|
|
149
|
+
c1: int,
|
|
150
|
+
cm: int,
|
|
151
|
+
c2: int,
|
|
152
|
+
k: int = 3,
|
|
153
|
+
n: int = 6,
|
|
154
|
+
lightconv: bool = False,
|
|
155
|
+
shortcut: bool = False,
|
|
156
|
+
act: nn.Module = nn.ReLU(),
|
|
157
|
+
):
|
|
158
|
+
"""Initialize HGBlock with specified parameters.
|
|
149
159
|
|
|
150
160
|
Args:
|
|
151
161
|
c1 (int): Input channels.
|
|
@@ -164,7 +174,7 @@ class HGBlock(nn.Module):
|
|
|
164
174
|
self.ec = Conv(c2 // 2, c2, 1, 1, act=act) # excitation conv
|
|
165
175
|
self.add = shortcut and c1 == c2
|
|
166
176
|
|
|
167
|
-
def forward(self, x):
|
|
177
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
168
178
|
"""Forward pass of a PPHGNetV2 backbone layer."""
|
|
169
179
|
y = [x]
|
|
170
180
|
y.extend(m(y[-1]) for m in self.m)
|
|
@@ -175,14 +185,13 @@ class HGBlock(nn.Module):
|
|
|
175
185
|
class SPP(nn.Module):
|
|
176
186
|
"""Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
|
|
177
187
|
|
|
178
|
-
def __init__(self, c1, c2, k=(5, 9, 13)):
|
|
179
|
-
"""
|
|
180
|
-
Initialize the SPP layer with input/output channels and pooling kernel sizes.
|
|
188
|
+
def __init__(self, c1: int, c2: int, k: tuple[int, ...] = (5, 9, 13)):
|
|
189
|
+
"""Initialize the SPP layer with input/output channels and pooling kernel sizes.
|
|
181
190
|
|
|
182
191
|
Args:
|
|
183
192
|
c1 (int): Input channels.
|
|
184
193
|
c2 (int): Output channels.
|
|
185
|
-
k (Tuple[int, int, int]): Kernel sizes for max pooling.
|
|
194
|
+
k (tuple): Kernel sizes for max pooling.
|
|
186
195
|
"""
|
|
187
196
|
super().__init__()
|
|
188
197
|
c_ = c1 // 2 # hidden channels
|
|
@@ -190,7 +199,7 @@ class SPP(nn.Module):
|
|
|
190
199
|
self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
|
|
191
200
|
self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
|
|
192
201
|
|
|
193
|
-
def forward(self, x):
|
|
202
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
194
203
|
"""Forward pass of the SPP layer, performing spatial pyramid pooling."""
|
|
195
204
|
x = self.cv1(x)
|
|
196
205
|
return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
|
|
@@ -199,9 +208,8 @@ class SPP(nn.Module):
|
|
|
199
208
|
class SPPF(nn.Module):
|
|
200
209
|
"""Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
|
|
201
210
|
|
|
202
|
-
def __init__(self, c1, c2, k=5):
|
|
203
|
-
"""
|
|
204
|
-
Initialize the SPPF layer with given input/output channels and kernel size.
|
|
211
|
+
def __init__(self, c1: int, c2: int, k: int = 5):
|
|
212
|
+
"""Initialize the SPPF layer with given input/output channels and kernel size.
|
|
205
213
|
|
|
206
214
|
Args:
|
|
207
215
|
c1 (int): Input channels.
|
|
@@ -217,7 +225,7 @@ class SPPF(nn.Module):
|
|
|
217
225
|
self.cv2 = Conv(c_ * 4, c2, 1, 1)
|
|
218
226
|
self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
|
|
219
227
|
|
|
220
|
-
def forward(self, x):
|
|
228
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
221
229
|
"""Apply sequential pooling operations to input and return concatenated feature maps."""
|
|
222
230
|
y = [self.cv1(x)]
|
|
223
231
|
y.extend(self.m(y[-1]) for _ in range(3))
|
|
@@ -227,9 +235,8 @@ class SPPF(nn.Module):
|
|
|
227
235
|
class C1(nn.Module):
|
|
228
236
|
"""CSP Bottleneck with 1 convolution."""
|
|
229
237
|
|
|
230
|
-
def __init__(self, c1, c2, n=1):
|
|
231
|
-
"""
|
|
232
|
-
Initialize the CSP Bottleneck with 1 convolution.
|
|
238
|
+
def __init__(self, c1: int, c2: int, n: int = 1):
|
|
239
|
+
"""Initialize the CSP Bottleneck with 1 convolution.
|
|
233
240
|
|
|
234
241
|
Args:
|
|
235
242
|
c1 (int): Input channels.
|
|
@@ -240,7 +247,7 @@ class C1(nn.Module):
|
|
|
240
247
|
self.cv1 = Conv(c1, c2, 1, 1)
|
|
241
248
|
self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
|
|
242
249
|
|
|
243
|
-
def forward(self, x):
|
|
250
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
244
251
|
"""Apply convolution and residual connection to input tensor."""
|
|
245
252
|
y = self.cv1(x)
|
|
246
253
|
return self.m(y) + y
|
|
@@ -249,9 +256,8 @@ class C1(nn.Module):
|
|
|
249
256
|
class C2(nn.Module):
|
|
250
257
|
"""CSP Bottleneck with 2 convolutions."""
|
|
251
258
|
|
|
252
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
253
|
-
"""
|
|
254
|
-
Initialize a CSP Bottleneck with 2 convolutions.
|
|
259
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
260
|
+
"""Initialize a CSP Bottleneck with 2 convolutions.
|
|
255
261
|
|
|
256
262
|
Args:
|
|
257
263
|
c1 (int): Input channels.
|
|
@@ -268,7 +274,7 @@ class C2(nn.Module):
|
|
|
268
274
|
# self.attention = ChannelAttention(2 * self.c) # or SpatialAttention()
|
|
269
275
|
self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)))
|
|
270
276
|
|
|
271
|
-
def forward(self, x):
|
|
277
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
272
278
|
"""Forward pass through the CSP bottleneck with 2 convolutions."""
|
|
273
279
|
a, b = self.cv1(x).chunk(2, 1)
|
|
274
280
|
return self.cv2(torch.cat((self.m(a), b), 1))
|
|
@@ -277,9 +283,8 @@ class C2(nn.Module):
|
|
|
277
283
|
class C2f(nn.Module):
|
|
278
284
|
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
|
|
279
285
|
|
|
280
|
-
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
|
|
281
|
-
"""
|
|
282
|
-
Initialize a CSP bottleneck with 2 convolutions.
|
|
286
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = False, g: int = 1, e: float = 0.5):
|
|
287
|
+
"""Initialize a CSP bottleneck with 2 convolutions.
|
|
283
288
|
|
|
284
289
|
Args:
|
|
285
290
|
c1 (int): Input channels.
|
|
@@ -295,13 +300,13 @@ class C2f(nn.Module):
|
|
|
295
300
|
self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2)
|
|
296
301
|
self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
|
|
297
302
|
|
|
298
|
-
def forward(self, x):
|
|
303
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
299
304
|
"""Forward pass through C2f layer."""
|
|
300
305
|
y = list(self.cv1(x).chunk(2, 1))
|
|
301
306
|
y.extend(m(y[-1]) for m in self.m)
|
|
302
307
|
return self.cv2(torch.cat(y, 1))
|
|
303
308
|
|
|
304
|
-
def forward_split(self, x):
|
|
309
|
+
def forward_split(self, x: torch.Tensor) -> torch.Tensor:
|
|
305
310
|
"""Forward pass using split() instead of chunk()."""
|
|
306
311
|
y = self.cv1(x).split((self.c, self.c), 1)
|
|
307
312
|
y = [y[0], y[1]]
|
|
@@ -312,9 +317,8 @@ class C2f(nn.Module):
|
|
|
312
317
|
class C3(nn.Module):
|
|
313
318
|
"""CSP Bottleneck with 3 convolutions."""
|
|
314
319
|
|
|
315
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
316
|
-
"""
|
|
317
|
-
Initialize the CSP Bottleneck with 3 convolutions.
|
|
320
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
321
|
+
"""Initialize the CSP Bottleneck with 3 convolutions.
|
|
318
322
|
|
|
319
323
|
Args:
|
|
320
324
|
c1 (int): Input channels.
|
|
@@ -331,7 +335,7 @@ class C3(nn.Module):
|
|
|
331
335
|
self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2)
|
|
332
336
|
self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
|
|
333
337
|
|
|
334
|
-
def forward(self, x):
|
|
338
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
335
339
|
"""Forward pass through the CSP bottleneck with 3 convolutions."""
|
|
336
340
|
return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
|
|
337
341
|
|
|
@@ -339,9 +343,8 @@ class C3(nn.Module):
|
|
|
339
343
|
class C3x(C3):
|
|
340
344
|
"""C3 module with cross-convolutions."""
|
|
341
345
|
|
|
342
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
343
|
-
"""
|
|
344
|
-
Initialize C3 module with cross-convolutions.
|
|
346
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
347
|
+
"""Initialize C3 module with cross-convolutions.
|
|
345
348
|
|
|
346
349
|
Args:
|
|
347
350
|
c1 (int): Input channels.
|
|
@@ -359,9 +362,8 @@ class C3x(C3):
|
|
|
359
362
|
class RepC3(nn.Module):
|
|
360
363
|
"""Rep C3."""
|
|
361
364
|
|
|
362
|
-
def __init__(self, c1, c2, n=3, e=1.0):
|
|
363
|
-
"""
|
|
364
|
-
Initialize CSP Bottleneck with a single convolution.
|
|
365
|
+
def __init__(self, c1: int, c2: int, n: int = 3, e: float = 1.0):
|
|
366
|
+
"""Initialize CSP Bottleneck with a single convolution.
|
|
365
367
|
|
|
366
368
|
Args:
|
|
367
369
|
c1 (int): Input channels.
|
|
@@ -376,7 +378,7 @@ class RepC3(nn.Module):
|
|
|
376
378
|
self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])
|
|
377
379
|
self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()
|
|
378
380
|
|
|
379
|
-
def forward(self, x):
|
|
381
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
380
382
|
"""Forward pass of RepC3 module."""
|
|
381
383
|
return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
|
|
382
384
|
|
|
@@ -384,9 +386,8 @@ class RepC3(nn.Module):
|
|
|
384
386
|
class C3TR(C3):
|
|
385
387
|
"""C3 module with TransformerBlock()."""
|
|
386
388
|
|
|
387
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
388
|
-
"""
|
|
389
|
-
Initialize C3 module with TransformerBlock.
|
|
389
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
390
|
+
"""Initialize C3 module with TransformerBlock.
|
|
390
391
|
|
|
391
392
|
Args:
|
|
392
393
|
c1 (int): Input channels.
|
|
@@ -404,9 +405,8 @@ class C3TR(C3):
|
|
|
404
405
|
class C3Ghost(C3):
|
|
405
406
|
"""C3 module with GhostBottleneck()."""
|
|
406
407
|
|
|
407
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
408
|
-
"""
|
|
409
|
-
Initialize C3 module with GhostBottleneck.
|
|
408
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
409
|
+
"""Initialize C3 module with GhostBottleneck.
|
|
410
410
|
|
|
411
411
|
Args:
|
|
412
412
|
c1 (int): Input channels.
|
|
@@ -424,9 +424,8 @@ class C3Ghost(C3):
|
|
|
424
424
|
class GhostBottleneck(nn.Module):
|
|
425
425
|
"""Ghost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones."""
|
|
426
426
|
|
|
427
|
-
def __init__(self, c1, c2, k=3, s=1):
|
|
428
|
-
"""
|
|
429
|
-
Initialize Ghost Bottleneck module.
|
|
427
|
+
def __init__(self, c1: int, c2: int, k: int = 3, s: int = 1):
|
|
428
|
+
"""Initialize Ghost Bottleneck module.
|
|
430
429
|
|
|
431
430
|
Args:
|
|
432
431
|
c1 (int): Input channels.
|
|
@@ -445,7 +444,7 @@ class GhostBottleneck(nn.Module):
|
|
|
445
444
|
nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
|
|
446
445
|
)
|
|
447
446
|
|
|
448
|
-
def forward(self, x):
|
|
447
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
449
448
|
"""Apply skip connection and concatenation to input tensor."""
|
|
450
449
|
return self.conv(x) + self.shortcut(x)
|
|
451
450
|
|
|
@@ -453,16 +452,17 @@ class GhostBottleneck(nn.Module):
|
|
|
453
452
|
class Bottleneck(nn.Module):
|
|
454
453
|
"""Standard bottleneck."""
|
|
455
454
|
|
|
456
|
-
def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
|
|
457
|
-
"""
|
|
458
|
-
Initialize a standard bottleneck module.
|
|
455
|
+
def __init__(
|
|
456
|
+
self, c1: int, c2: int, shortcut: bool = True, g: int = 1, k: tuple[int, int] = (3, 3), e: float = 0.5
|
|
457
|
+
):
|
|
458
|
+
"""Initialize a standard bottleneck module.
|
|
459
459
|
|
|
460
460
|
Args:
|
|
461
461
|
c1 (int): Input channels.
|
|
462
462
|
c2 (int): Output channels.
|
|
463
463
|
shortcut (bool): Whether to use shortcut connection.
|
|
464
464
|
g (int): Groups for convolutions.
|
|
465
|
-
k (Tuple[int, int]): Kernel sizes for convolutions.
|
|
465
|
+
k (tuple): Kernel sizes for convolutions.
|
|
466
466
|
e (float): Expansion ratio.
|
|
467
467
|
"""
|
|
468
468
|
super().__init__()
|
|
@@ -471,7 +471,7 @@ class Bottleneck(nn.Module):
|
|
|
471
471
|
self.cv2 = Conv(c_, c2, k[1], 1, g=g)
|
|
472
472
|
self.add = shortcut and c1 == c2
|
|
473
473
|
|
|
474
|
-
def forward(self, x):
|
|
474
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
475
475
|
"""Apply bottleneck with optional shortcut connection."""
|
|
476
476
|
return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
|
|
477
477
|
|
|
@@ -479,9 +479,8 @@ class Bottleneck(nn.Module):
|
|
|
479
479
|
class BottleneckCSP(nn.Module):
|
|
480
480
|
"""CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
|
|
481
481
|
|
|
482
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
483
|
-
"""
|
|
484
|
-
Initialize CSP Bottleneck.
|
|
482
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
483
|
+
"""Initialize CSP Bottleneck.
|
|
485
484
|
|
|
486
485
|
Args:
|
|
487
486
|
c1 (int): Input channels.
|
|
@@ -501,7 +500,7 @@ class BottleneckCSP(nn.Module):
|
|
|
501
500
|
self.act = nn.SiLU()
|
|
502
501
|
self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
|
|
503
502
|
|
|
504
|
-
def forward(self, x):
|
|
503
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
505
504
|
"""Apply CSP bottleneck with 3 convolutions."""
|
|
506
505
|
y1 = self.cv3(self.m(self.cv1(x)))
|
|
507
506
|
y2 = self.cv2(x)
|
|
@@ -511,9 +510,8 @@ class BottleneckCSP(nn.Module):
|
|
|
511
510
|
class ResNetBlock(nn.Module):
|
|
512
511
|
"""ResNet block with standard convolution layers."""
|
|
513
512
|
|
|
514
|
-
def __init__(self, c1, c2, s=1, e=4):
|
|
515
|
-
"""
|
|
516
|
-
Initialize ResNet block.
|
|
513
|
+
def __init__(self, c1: int, c2: int, s: int = 1, e: int = 4):
|
|
514
|
+
"""Initialize ResNet block.
|
|
517
515
|
|
|
518
516
|
Args:
|
|
519
517
|
c1 (int): Input channels.
|
|
@@ -528,7 +526,7 @@ class ResNetBlock(nn.Module):
|
|
|
528
526
|
self.cv3 = Conv(c2, c3, k=1, act=False)
|
|
529
527
|
self.shortcut = nn.Sequential(Conv(c1, c3, k=1, s=s, act=False)) if s != 1 or c1 != c3 else nn.Identity()
|
|
530
528
|
|
|
531
|
-
def forward(self, x):
|
|
529
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
532
530
|
"""Forward pass through the ResNet block."""
|
|
533
531
|
return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x))
|
|
534
532
|
|
|
@@ -536,9 +534,8 @@ class ResNetBlock(nn.Module):
|
|
|
536
534
|
class ResNetLayer(nn.Module):
|
|
537
535
|
"""ResNet layer with multiple ResNet blocks."""
|
|
538
536
|
|
|
539
|
-
def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4):
|
|
540
|
-
"""
|
|
541
|
-
Initialize ResNet layer.
|
|
537
|
+
def __init__(self, c1: int, c2: int, s: int = 1, is_first: bool = False, n: int = 1, e: int = 4):
|
|
538
|
+
"""Initialize ResNet layer.
|
|
542
539
|
|
|
543
540
|
Args:
|
|
544
541
|
c1 (int): Input channels.
|
|
@@ -560,7 +557,7 @@ class ResNetLayer(nn.Module):
|
|
|
560
557
|
blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
|
|
561
558
|
self.layer = nn.Sequential(*blocks)
|
|
562
559
|
|
|
563
|
-
def forward(self, x):
|
|
560
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
564
561
|
"""Forward pass through the ResNet layer."""
|
|
565
562
|
return self.layer(x)
|
|
566
563
|
|
|
@@ -568,9 +565,8 @@ class ResNetLayer(nn.Module):
|
|
|
568
565
|
class MaxSigmoidAttnBlock(nn.Module):
|
|
569
566
|
"""Max Sigmoid attention block."""
|
|
570
567
|
|
|
571
|
-
def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
|
|
572
|
-
"""
|
|
573
|
-
Initialize MaxSigmoidAttnBlock.
|
|
568
|
+
def __init__(self, c1: int, c2: int, nh: int = 1, ec: int = 128, gc: int = 512, scale: bool = False):
|
|
569
|
+
"""Initialize MaxSigmoidAttnBlock.
|
|
574
570
|
|
|
575
571
|
Args:
|
|
576
572
|
c1 (int): Input channels.
|
|
@@ -589,9 +585,8 @@ class MaxSigmoidAttnBlock(nn.Module):
|
|
|
589
585
|
self.proj_conv = Conv(c1, c2, k=3, s=1, act=False)
|
|
590
586
|
self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0
|
|
591
587
|
|
|
592
|
-
def forward(self, x, guide):
|
|
593
|
-
"""
|
|
594
|
-
Forward pass of MaxSigmoidAttnBlock.
|
|
588
|
+
def forward(self, x: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
|
|
589
|
+
"""Forward pass of MaxSigmoidAttnBlock.
|
|
595
590
|
|
|
596
591
|
Args:
|
|
597
592
|
x (torch.Tensor): Input tensor.
|
|
@@ -622,9 +617,19 @@ class MaxSigmoidAttnBlock(nn.Module):
|
|
|
622
617
|
class C2fAttn(nn.Module):
|
|
623
618
|
"""C2f module with an additional attn module."""
|
|
624
619
|
|
|
625
|
-
def __init__(
|
|
626
|
-
|
|
627
|
-
|
|
620
|
+
def __init__(
|
|
621
|
+
self,
|
|
622
|
+
c1: int,
|
|
623
|
+
c2: int,
|
|
624
|
+
n: int = 1,
|
|
625
|
+
ec: int = 128,
|
|
626
|
+
nh: int = 1,
|
|
627
|
+
gc: int = 512,
|
|
628
|
+
shortcut: bool = False,
|
|
629
|
+
g: int = 1,
|
|
630
|
+
e: float = 0.5,
|
|
631
|
+
):
|
|
632
|
+
"""Initialize C2f module with attention mechanism.
|
|
628
633
|
|
|
629
634
|
Args:
|
|
630
635
|
c1 (int): Input channels.
|
|
@@ -644,9 +649,8 @@ class C2fAttn(nn.Module):
|
|
|
644
649
|
self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
|
|
645
650
|
self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)
|
|
646
651
|
|
|
647
|
-
def forward(self, x, guide):
|
|
648
|
-
"""
|
|
649
|
-
Forward pass through C2f layer with attention.
|
|
652
|
+
def forward(self, x: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
|
|
653
|
+
"""Forward pass through C2f layer with attention.
|
|
650
654
|
|
|
651
655
|
Args:
|
|
652
656
|
x (torch.Tensor): Input tensor.
|
|
@@ -660,9 +664,8 @@ class C2fAttn(nn.Module):
|
|
|
660
664
|
y.append(self.attn(y[-1], guide))
|
|
661
665
|
return self.cv2(torch.cat(y, 1))
|
|
662
666
|
|
|
663
|
-
def forward_split(self, x, guide):
|
|
664
|
-
"""
|
|
665
|
-
Forward pass using split() instead of chunk().
|
|
667
|
+
def forward_split(self, x: torch.Tensor, guide: torch.Tensor) -> torch.Tensor:
|
|
668
|
+
"""Forward pass using split() instead of chunk().
|
|
666
669
|
|
|
667
670
|
Args:
|
|
668
671
|
x (torch.Tensor): Input tensor.
|
|
@@ -680,9 +683,10 @@ class C2fAttn(nn.Module):
|
|
|
680
683
|
class ImagePoolingAttn(nn.Module):
|
|
681
684
|
"""ImagePoolingAttn: Enhance the text embeddings with image-aware information."""
|
|
682
685
|
|
|
683
|
-
def __init__(
|
|
684
|
-
|
|
685
|
-
|
|
686
|
+
def __init__(
|
|
687
|
+
self, ec: int = 256, ch: tuple[int, ...] = (), ct: int = 512, nh: int = 8, k: int = 3, scale: bool = False
|
|
688
|
+
):
|
|
689
|
+
"""Initialize ImagePoolingAttn module.
|
|
686
690
|
|
|
687
691
|
Args:
|
|
688
692
|
ec (int): Embedding channels.
|
|
@@ -708,12 +712,11 @@ class ImagePoolingAttn(nn.Module):
|
|
|
708
712
|
self.hc = ec // nh
|
|
709
713
|
self.k = k
|
|
710
714
|
|
|
711
|
-
def forward(self, x, text):
|
|
712
|
-
"""
|
|
713
|
-
Forward pass of ImagePoolingAttn.
|
|
715
|
+
def forward(self, x: list[torch.Tensor], text: torch.Tensor) -> torch.Tensor:
|
|
716
|
+
"""Forward pass of ImagePoolingAttn.
|
|
714
717
|
|
|
715
718
|
Args:
|
|
716
|
-
x (
|
|
719
|
+
x (list[torch.Tensor]): List of input feature maps.
|
|
717
720
|
text (torch.Tensor): Text embeddings.
|
|
718
721
|
|
|
719
722
|
Returns:
|
|
@@ -752,9 +755,8 @@ class ContrastiveHead(nn.Module):
|
|
|
752
755
|
self.bias = nn.Parameter(torch.tensor([-10.0]))
|
|
753
756
|
self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
|
|
754
757
|
|
|
755
|
-
def forward(self, x, w):
|
|
756
|
-
"""
|
|
757
|
-
Forward function of contrastive learning.
|
|
758
|
+
def forward(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
|
|
759
|
+
"""Forward function of contrastive learning.
|
|
758
760
|
|
|
759
761
|
Args:
|
|
760
762
|
x (torch.Tensor): Image features.
|
|
@@ -770,16 +772,14 @@ class ContrastiveHead(nn.Module):
|
|
|
770
772
|
|
|
771
773
|
|
|
772
774
|
class BNContrastiveHead(nn.Module):
|
|
773
|
-
"""
|
|
774
|
-
Batch Norm Contrastive Head using batch norm instead of l2-normalization.
|
|
775
|
+
"""Batch Norm Contrastive Head using batch norm instead of l2-normalization.
|
|
775
776
|
|
|
776
777
|
Args:
|
|
777
778
|
embed_dims (int): Embed dimensions of text and image features.
|
|
778
779
|
"""
|
|
779
780
|
|
|
780
781
|
def __init__(self, embed_dims: int):
|
|
781
|
-
"""
|
|
782
|
-
Initialize BNContrastiveHead.
|
|
782
|
+
"""Initialize BNContrastiveHead.
|
|
783
783
|
|
|
784
784
|
Args:
|
|
785
785
|
embed_dims (int): Embedding dimensions for features.
|
|
@@ -798,17 +798,12 @@ class BNContrastiveHead(nn.Module):
|
|
|
798
798
|
del self.logit_scale
|
|
799
799
|
self.forward = self.forward_fuse
|
|
800
800
|
|
|
801
|
-
def forward_fuse(self, x, w):
|
|
802
|
-
"""
|
|
803
|
-
Passes input out unchanged.
|
|
804
|
-
|
|
805
|
-
TODO: Update or remove?
|
|
806
|
-
"""
|
|
801
|
+
def forward_fuse(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
|
|
802
|
+
"""Passes input out unchanged."""
|
|
807
803
|
return x
|
|
808
804
|
|
|
809
|
-
def forward(self, x, w):
|
|
810
|
-
"""
|
|
811
|
-
Forward function of contrastive learning with batch normalization.
|
|
805
|
+
def forward(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
|
|
806
|
+
"""Forward function of contrastive learning with batch normalization.
|
|
812
807
|
|
|
813
808
|
Args:
|
|
814
809
|
x (torch.Tensor): Image features.
|
|
@@ -827,16 +822,17 @@ class BNContrastiveHead(nn.Module):
|
|
|
827
822
|
class RepBottleneck(Bottleneck):
|
|
828
823
|
"""Rep bottleneck."""
|
|
829
824
|
|
|
830
|
-
def __init__(
|
|
831
|
-
|
|
832
|
-
|
|
825
|
+
def __init__(
|
|
826
|
+
self, c1: int, c2: int, shortcut: bool = True, g: int = 1, k: tuple[int, int] = (3, 3), e: float = 0.5
|
|
827
|
+
):
|
|
828
|
+
"""Initialize RepBottleneck.
|
|
833
829
|
|
|
834
830
|
Args:
|
|
835
831
|
c1 (int): Input channels.
|
|
836
832
|
c2 (int): Output channels.
|
|
837
833
|
shortcut (bool): Whether to use shortcut connection.
|
|
838
834
|
g (int): Groups for convolutions.
|
|
839
|
-
k (
|
|
835
|
+
k (tuple): Kernel sizes for convolutions.
|
|
840
836
|
e (float): Expansion ratio.
|
|
841
837
|
"""
|
|
842
838
|
super().__init__(c1, c2, shortcut, g, k, e)
|
|
@@ -847,9 +843,8 @@ class RepBottleneck(Bottleneck):
|
|
|
847
843
|
class RepCSP(C3):
|
|
848
844
|
"""Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction."""
|
|
849
845
|
|
|
850
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
|
|
851
|
-
"""
|
|
852
|
-
Initialize RepCSP layer.
|
|
846
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5):
|
|
847
|
+
"""Initialize RepCSP layer.
|
|
853
848
|
|
|
854
849
|
Args:
|
|
855
850
|
c1 (int): Input channels.
|
|
@@ -867,9 +862,8 @@ class RepCSP(C3):
|
|
|
867
862
|
class RepNCSPELAN4(nn.Module):
|
|
868
863
|
"""CSP-ELAN."""
|
|
869
864
|
|
|
870
|
-
def __init__(self, c1, c2, c3, c4, n=1):
|
|
871
|
-
"""
|
|
872
|
-
Initialize CSP-ELAN layer.
|
|
865
|
+
def __init__(self, c1: int, c2: int, c3: int, c4: int, n: int = 1):
|
|
866
|
+
"""Initialize CSP-ELAN layer.
|
|
873
867
|
|
|
874
868
|
Args:
|
|
875
869
|
c1 (int): Input channels.
|
|
@@ -885,13 +879,13 @@ class RepNCSPELAN4(nn.Module):
|
|
|
885
879
|
self.cv3 = nn.Sequential(RepCSP(c4, c4, n), Conv(c4, c4, 3, 1))
|
|
886
880
|
self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)
|
|
887
881
|
|
|
888
|
-
def forward(self, x):
|
|
882
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
889
883
|
"""Forward pass through RepNCSPELAN4 layer."""
|
|
890
884
|
y = list(self.cv1(x).chunk(2, 1))
|
|
891
885
|
y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
|
|
892
886
|
return self.cv4(torch.cat(y, 1))
|
|
893
887
|
|
|
894
|
-
def forward_split(self, x):
|
|
888
|
+
def forward_split(self, x: torch.Tensor) -> torch.Tensor:
|
|
895
889
|
"""Forward pass using split() instead of chunk()."""
|
|
896
890
|
y = list(self.cv1(x).split((self.c, self.c), 1))
|
|
897
891
|
y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
|
|
@@ -901,9 +895,8 @@ class RepNCSPELAN4(nn.Module):
|
|
|
901
895
|
class ELAN1(RepNCSPELAN4):
|
|
902
896
|
"""ELAN1 module with 4 convolutions."""
|
|
903
897
|
|
|
904
|
-
def __init__(self, c1, c2, c3, c4):
|
|
905
|
-
"""
|
|
906
|
-
Initialize ELAN1 layer.
|
|
898
|
+
def __init__(self, c1: int, c2: int, c3: int, c4: int):
|
|
899
|
+
"""Initialize ELAN1 layer.
|
|
907
900
|
|
|
908
901
|
Args:
|
|
909
902
|
c1 (int): Input channels.
|
|
@@ -922,9 +915,8 @@ class ELAN1(RepNCSPELAN4):
|
|
|
922
915
|
class AConv(nn.Module):
|
|
923
916
|
"""AConv."""
|
|
924
917
|
|
|
925
|
-
def __init__(self, c1, c2):
|
|
926
|
-
"""
|
|
927
|
-
Initialize AConv module.
|
|
918
|
+
def __init__(self, c1: int, c2: int):
|
|
919
|
+
"""Initialize AConv module.
|
|
928
920
|
|
|
929
921
|
Args:
|
|
930
922
|
c1 (int): Input channels.
|
|
@@ -933,7 +925,7 @@ class AConv(nn.Module):
|
|
|
933
925
|
super().__init__()
|
|
934
926
|
self.cv1 = Conv(c1, c2, 3, 2, 1)
|
|
935
927
|
|
|
936
|
-
def forward(self, x):
|
|
928
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
937
929
|
"""Forward pass through AConv layer."""
|
|
938
930
|
x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
|
|
939
931
|
return self.cv1(x)
|
|
@@ -942,9 +934,8 @@ class AConv(nn.Module):
|
|
|
942
934
|
class ADown(nn.Module):
|
|
943
935
|
"""ADown."""
|
|
944
936
|
|
|
945
|
-
def __init__(self, c1, c2):
|
|
946
|
-
"""
|
|
947
|
-
Initialize ADown module.
|
|
937
|
+
def __init__(self, c1: int, c2: int):
|
|
938
|
+
"""Initialize ADown module.
|
|
948
939
|
|
|
949
940
|
Args:
|
|
950
941
|
c1 (int): Input channels.
|
|
@@ -955,7 +946,7 @@ class ADown(nn.Module):
|
|
|
955
946
|
self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
|
|
956
947
|
self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0)
|
|
957
948
|
|
|
958
|
-
def forward(self, x):
|
|
949
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
959
950
|
"""Forward pass through ADown layer."""
|
|
960
951
|
x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
|
|
961
952
|
x1, x2 = x.chunk(2, 1)
|
|
@@ -968,9 +959,8 @@ class ADown(nn.Module):
|
|
|
968
959
|
class SPPELAN(nn.Module):
|
|
969
960
|
"""SPP-ELAN."""
|
|
970
961
|
|
|
971
|
-
def __init__(self, c1, c2, c3, k=5):
|
|
972
|
-
"""
|
|
973
|
-
Initialize SPP-ELAN block.
|
|
962
|
+
def __init__(self, c1: int, c2: int, c3: int, k: int = 5):
|
|
963
|
+
"""Initialize SPP-ELAN block.
|
|
974
964
|
|
|
975
965
|
Args:
|
|
976
966
|
c1 (int): Input channels.
|
|
@@ -986,7 +976,7 @@ class SPPELAN(nn.Module):
|
|
|
986
976
|
self.cv4 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
|
|
987
977
|
self.cv5 = Conv(4 * c3, c2, 1, 1)
|
|
988
978
|
|
|
989
|
-
def forward(self, x):
|
|
979
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
990
980
|
"""Forward pass through SPPELAN layer."""
|
|
991
981
|
y = [self.cv1(x)]
|
|
992
982
|
y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4])
|
|
@@ -996,13 +986,12 @@ class SPPELAN(nn.Module):
|
|
|
996
986
|
class CBLinear(nn.Module):
|
|
997
987
|
"""CBLinear."""
|
|
998
988
|
|
|
999
|
-
def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
|
|
1000
|
-
"""
|
|
1001
|
-
Initialize CBLinear module.
|
|
989
|
+
def __init__(self, c1: int, c2s: list[int], k: int = 1, s: int = 1, p: int | None = None, g: int = 1):
|
|
990
|
+
"""Initialize CBLinear module.
|
|
1002
991
|
|
|
1003
992
|
Args:
|
|
1004
993
|
c1 (int): Input channels.
|
|
1005
|
-
c2s (
|
|
994
|
+
c2s (list[int]): List of output channel sizes.
|
|
1006
995
|
k (int): Kernel size.
|
|
1007
996
|
s (int): Stride.
|
|
1008
997
|
p (int | None): Padding.
|
|
@@ -1012,7 +1001,7 @@ class CBLinear(nn.Module):
|
|
|
1012
1001
|
self.c2s = c2s
|
|
1013
1002
|
self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)
|
|
1014
1003
|
|
|
1015
|
-
def forward(self, x):
|
|
1004
|
+
def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
|
|
1016
1005
|
"""Forward pass through CBLinear layer."""
|
|
1017
1006
|
return self.conv(x).split(self.c2s, dim=1)
|
|
1018
1007
|
|
|
@@ -1020,22 +1009,20 @@ class CBLinear(nn.Module):
|
|
|
1020
1009
|
class CBFuse(nn.Module):
|
|
1021
1010
|
"""CBFuse."""
|
|
1022
1011
|
|
|
1023
|
-
def __init__(self, idx):
|
|
1024
|
-
"""
|
|
1025
|
-
Initialize CBFuse module.
|
|
1012
|
+
def __init__(self, idx: list[int]):
|
|
1013
|
+
"""Initialize CBFuse module.
|
|
1026
1014
|
|
|
1027
1015
|
Args:
|
|
1028
|
-
idx (
|
|
1016
|
+
idx (list[int]): Indices for feature selection.
|
|
1029
1017
|
"""
|
|
1030
1018
|
super().__init__()
|
|
1031
1019
|
self.idx = idx
|
|
1032
1020
|
|
|
1033
|
-
def forward(self, xs):
|
|
1034
|
-
"""
|
|
1035
|
-
Forward pass through CBFuse layer.
|
|
1021
|
+
def forward(self, xs: list[torch.Tensor]) -> torch.Tensor:
|
|
1022
|
+
"""Forward pass through CBFuse layer.
|
|
1036
1023
|
|
|
1037
1024
|
Args:
|
|
1038
|
-
xs (
|
|
1025
|
+
xs (list[torch.Tensor]): List of input tensors.
|
|
1039
1026
|
|
|
1040
1027
|
Returns:
|
|
1041
1028
|
(torch.Tensor): Fused output tensor.
|
|
@@ -1048,9 +1035,8 @@ class CBFuse(nn.Module):
|
|
|
1048
1035
|
class C3f(nn.Module):
|
|
1049
1036
|
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
|
|
1050
1037
|
|
|
1051
|
-
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
|
|
1052
|
-
"""
|
|
1053
|
-
Initialize CSP bottleneck layer with two convolutions.
|
|
1038
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = False, g: int = 1, e: float = 0.5):
|
|
1039
|
+
"""Initialize CSP bottleneck layer with two convolutions.
|
|
1054
1040
|
|
|
1055
1041
|
Args:
|
|
1056
1042
|
c1 (int): Input channels.
|
|
@@ -1067,7 +1053,7 @@ class C3f(nn.Module):
|
|
|
1067
1053
|
self.cv3 = Conv((2 + n) * c_, c2, 1) # optional act=FReLU(c2)
|
|
1068
1054
|
self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
|
|
1069
1055
|
|
|
1070
|
-
def forward(self, x):
|
|
1056
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1071
1057
|
"""Forward pass through C3f layer."""
|
|
1072
1058
|
y = [self.cv2(x), self.cv1(x)]
|
|
1073
1059
|
y.extend(m(y[-1]) for m in self.m)
|
|
@@ -1077,9 +1063,10 @@ class C3f(nn.Module):
|
|
|
1077
1063
|
class C3k2(C2f):
|
|
1078
1064
|
"""Faster Implementation of CSP Bottleneck with 2 convolutions."""
|
|
1079
1065
|
|
|
1080
|
-
def __init__(
|
|
1081
|
-
|
|
1082
|
-
|
|
1066
|
+
def __init__(
|
|
1067
|
+
self, c1: int, c2: int, n: int = 1, c3k: bool = False, e: float = 0.5, g: int = 1, shortcut: bool = True
|
|
1068
|
+
):
|
|
1069
|
+
"""Initialize C3k2 module.
|
|
1083
1070
|
|
|
1084
1071
|
Args:
|
|
1085
1072
|
c1 (int): Input channels.
|
|
@@ -1099,9 +1086,8 @@ class C3k2(C2f):
|
|
|
1099
1086
|
class C3k(C3):
|
|
1100
1087
|
"""C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
|
|
1101
1088
|
|
|
1102
|
-
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
|
|
1103
|
-
"""
|
|
1104
|
-
Initialize C3k module.
|
|
1089
|
+
def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5, k: int = 3):
|
|
1090
|
+
"""Initialize C3k module.
|
|
1105
1091
|
|
|
1106
1092
|
Args:
|
|
1107
1093
|
c1 (int): Input channels.
|
|
@@ -1121,9 +1107,8 @@ class C3k(C3):
|
|
|
1121
1107
|
class RepVGGDW(torch.nn.Module):
|
|
1122
1108
|
"""RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
|
|
1123
1109
|
|
|
1124
|
-
def __init__(self, ed) -> None:
|
|
1125
|
-
"""
|
|
1126
|
-
Initialize RepVGGDW module.
|
|
1110
|
+
def __init__(self, ed: int) -> None:
|
|
1111
|
+
"""Initialize RepVGGDW module.
|
|
1127
1112
|
|
|
1128
1113
|
Args:
|
|
1129
1114
|
ed (int): Input and output channels.
|
|
@@ -1134,9 +1119,8 @@ class RepVGGDW(torch.nn.Module):
|
|
|
1134
1119
|
self.dim = ed
|
|
1135
1120
|
self.act = nn.SiLU()
|
|
1136
1121
|
|
|
1137
|
-
def forward(self, x):
|
|
1138
|
-
"""
|
|
1139
|
-
Perform a forward pass of the RepVGGDW block.
|
|
1122
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1123
|
+
"""Perform a forward pass of the RepVGGDW block.
|
|
1140
1124
|
|
|
1141
1125
|
Args:
|
|
1142
1126
|
x (torch.Tensor): Input tensor.
|
|
@@ -1146,9 +1130,8 @@ class RepVGGDW(torch.nn.Module):
|
|
|
1146
1130
|
"""
|
|
1147
1131
|
return self.act(self.conv(x) + self.conv1(x))
|
|
1148
1132
|
|
|
1149
|
-
def forward_fuse(self, x):
|
|
1150
|
-
"""
|
|
1151
|
-
Perform a forward pass of the RepVGGDW block without fusing the convolutions.
|
|
1133
|
+
def forward_fuse(self, x: torch.Tensor) -> torch.Tensor:
|
|
1134
|
+
"""Perform a forward pass of the RepVGGDW block without fusing the convolutions.
|
|
1152
1135
|
|
|
1153
1136
|
Args:
|
|
1154
1137
|
x (torch.Tensor): Input tensor.
|
|
@@ -1160,8 +1143,7 @@ class RepVGGDW(torch.nn.Module):
|
|
|
1160
1143
|
|
|
1161
1144
|
@torch.no_grad()
|
|
1162
1145
|
def fuse(self):
|
|
1163
|
-
"""
|
|
1164
|
-
Fuse the convolutional layers in the RepVGGDW block.
|
|
1146
|
+
"""Fuse the convolutional layers in the RepVGGDW block.
|
|
1165
1147
|
|
|
1166
1148
|
This method fuses the convolutional layers and updates the weights and biases accordingly.
|
|
1167
1149
|
"""
|
|
@@ -1186,8 +1168,7 @@ class RepVGGDW(torch.nn.Module):
|
|
|
1186
1168
|
|
|
1187
1169
|
|
|
1188
1170
|
class CIB(nn.Module):
|
|
1189
|
-
"""
|
|
1190
|
-
Conditional Identity Block (CIB) module.
|
|
1171
|
+
"""Conditional Identity Block (CIB) module.
|
|
1191
1172
|
|
|
1192
1173
|
Args:
|
|
1193
1174
|
c1 (int): Number of input channels.
|
|
@@ -1197,9 +1178,8 @@ class CIB(nn.Module):
|
|
|
1197
1178
|
lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
|
|
1198
1179
|
"""
|
|
1199
1180
|
|
|
1200
|
-
def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
|
|
1201
|
-
"""
|
|
1202
|
-
Initialize the CIB module.
|
|
1181
|
+
def __init__(self, c1: int, c2: int, shortcut: bool = True, e: float = 0.5, lk: bool = False):
|
|
1182
|
+
"""Initialize the CIB module.
|
|
1203
1183
|
|
|
1204
1184
|
Args:
|
|
1205
1185
|
c1 (int): Input channels.
|
|
@@ -1220,9 +1200,8 @@ class CIB(nn.Module):
|
|
|
1220
1200
|
|
|
1221
1201
|
self.add = shortcut and c1 == c2
|
|
1222
1202
|
|
|
1223
|
-
def forward(self, x):
|
|
1224
|
-
"""
|
|
1225
|
-
Forward pass of the CIB module.
|
|
1203
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1204
|
+
"""Forward pass of the CIB module.
|
|
1226
1205
|
|
|
1227
1206
|
Args:
|
|
1228
1207
|
x (torch.Tensor): Input tensor.
|
|
@@ -1234,8 +1213,7 @@ class CIB(nn.Module):
|
|
|
1234
1213
|
|
|
1235
1214
|
|
|
1236
1215
|
class C2fCIB(C2f):
|
|
1237
|
-
"""
|
|
1238
|
-
C2fCIB class represents a convolutional block with C2f and CIB modules.
|
|
1216
|
+
"""C2fCIB class represents a convolutional block with C2f and CIB modules.
|
|
1239
1217
|
|
|
1240
1218
|
Args:
|
|
1241
1219
|
c1 (int): Number of input channels.
|
|
@@ -1247,9 +1225,10 @@ class C2fCIB(C2f):
|
|
|
1247
1225
|
e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
|
|
1248
1226
|
"""
|
|
1249
1227
|
|
|
1250
|
-
def __init__(
|
|
1251
|
-
|
|
1252
|
-
|
|
1228
|
+
def __init__(
|
|
1229
|
+
self, c1: int, c2: int, n: int = 1, shortcut: bool = False, lk: bool = False, g: int = 1, e: float = 0.5
|
|
1230
|
+
):
|
|
1231
|
+
"""Initialize C2fCIB module.
|
|
1253
1232
|
|
|
1254
1233
|
Args:
|
|
1255
1234
|
c1 (int): Input channels.
|
|
@@ -1265,8 +1244,7 @@ class C2fCIB(C2f):
|
|
|
1265
1244
|
|
|
1266
1245
|
|
|
1267
1246
|
class Attention(nn.Module):
|
|
1268
|
-
"""
|
|
1269
|
-
Attention module that performs self-attention on the input tensor.
|
|
1247
|
+
"""Attention module that performs self-attention on the input tensor.
|
|
1270
1248
|
|
|
1271
1249
|
Args:
|
|
1272
1250
|
dim (int): The input tensor dimension.
|
|
@@ -1283,9 +1261,8 @@ class Attention(nn.Module):
|
|
|
1283
1261
|
pe (Conv): Convolutional layer for positional encoding.
|
|
1284
1262
|
"""
|
|
1285
1263
|
|
|
1286
|
-
def __init__(self, dim, num_heads=8, attn_ratio=0.5):
|
|
1287
|
-
"""
|
|
1288
|
-
Initialize multi-head attention module.
|
|
1264
|
+
def __init__(self, dim: int, num_heads: int = 8, attn_ratio: float = 0.5):
|
|
1265
|
+
"""Initialize multi-head attention module.
|
|
1289
1266
|
|
|
1290
1267
|
Args:
|
|
1291
1268
|
dim (int): Input dimension.
|
|
@@ -1303,9 +1280,8 @@ class Attention(nn.Module):
|
|
|
1303
1280
|
self.proj = Conv(dim, dim, 1, act=False)
|
|
1304
1281
|
self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)
|
|
1305
1282
|
|
|
1306
|
-
def forward(self, x):
|
|
1307
|
-
"""
|
|
1308
|
-
Forward pass of the Attention module.
|
|
1283
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1284
|
+
"""Forward pass of the Attention module.
|
|
1309
1285
|
|
|
1310
1286
|
Args:
|
|
1311
1287
|
x (torch.Tensor): The input tensor.
|
|
@@ -1328,8 +1304,7 @@ class Attention(nn.Module):
|
|
|
1328
1304
|
|
|
1329
1305
|
|
|
1330
1306
|
class PSABlock(nn.Module):
|
|
1331
|
-
"""
|
|
1332
|
-
PSABlock class implementing a Position-Sensitive Attention block for neural networks.
|
|
1307
|
+
"""PSABlock class implementing a Position-Sensitive Attention block for neural networks.
|
|
1333
1308
|
|
|
1334
1309
|
This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
|
|
1335
1310
|
with optional shortcut connections.
|
|
@@ -1349,9 +1324,8 @@ class PSABlock(nn.Module):
|
|
|
1349
1324
|
>>> output_tensor = psablock(input_tensor)
|
|
1350
1325
|
"""
|
|
1351
1326
|
|
|
1352
|
-
def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
|
|
1353
|
-
"""
|
|
1354
|
-
Initialize the PSABlock.
|
|
1327
|
+
def __init__(self, c: int, attn_ratio: float = 0.5, num_heads: int = 4, shortcut: bool = True) -> None:
|
|
1328
|
+
"""Initialize the PSABlock.
|
|
1355
1329
|
|
|
1356
1330
|
Args:
|
|
1357
1331
|
c (int): Input and output channels.
|
|
@@ -1365,9 +1339,8 @@ class PSABlock(nn.Module):
|
|
|
1365
1339
|
self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False))
|
|
1366
1340
|
self.add = shortcut
|
|
1367
1341
|
|
|
1368
|
-
def forward(self, x):
|
|
1369
|
-
"""
|
|
1370
|
-
Execute a forward pass through PSABlock.
|
|
1342
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1343
|
+
"""Execute a forward pass through PSABlock.
|
|
1371
1344
|
|
|
1372
1345
|
Args:
|
|
1373
1346
|
x (torch.Tensor): Input tensor.
|
|
@@ -1381,8 +1354,7 @@ class PSABlock(nn.Module):
|
|
|
1381
1354
|
|
|
1382
1355
|
|
|
1383
1356
|
class PSA(nn.Module):
|
|
1384
|
-
"""
|
|
1385
|
-
PSA class for implementing Position-Sensitive Attention in neural networks.
|
|
1357
|
+
"""PSA class for implementing Position-Sensitive Attention in neural networks.
|
|
1386
1358
|
|
|
1387
1359
|
This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
|
|
1388
1360
|
input tensors, enhancing feature extraction and processing capabilities.
|
|
@@ -1404,9 +1376,8 @@ class PSA(nn.Module):
|
|
|
1404
1376
|
>>> output_tensor = psa.forward(input_tensor)
|
|
1405
1377
|
"""
|
|
1406
1378
|
|
|
1407
|
-
def __init__(self, c1, c2, e=0.5):
|
|
1408
|
-
"""
|
|
1409
|
-
Initialize PSA module.
|
|
1379
|
+
def __init__(self, c1: int, c2: int, e: float = 0.5):
|
|
1380
|
+
"""Initialize PSA module.
|
|
1410
1381
|
|
|
1411
1382
|
Args:
|
|
1412
1383
|
c1 (int): Input channels.
|
|
@@ -1422,9 +1393,8 @@ class PSA(nn.Module):
|
|
|
1422
1393
|
self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
|
|
1423
1394
|
self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
|
|
1424
1395
|
|
|
1425
|
-
def forward(self, x):
|
|
1426
|
-
"""
|
|
1427
|
-
Execute forward pass in PSA module.
|
|
1396
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1397
|
+
"""Execute forward pass in PSA module.
|
|
1428
1398
|
|
|
1429
1399
|
Args:
|
|
1430
1400
|
x (torch.Tensor): Input tensor.
|
|
@@ -1439,8 +1409,7 @@ class PSA(nn.Module):
|
|
|
1439
1409
|
|
|
1440
1410
|
|
|
1441
1411
|
class C2PSA(nn.Module):
|
|
1442
|
-
"""
|
|
1443
|
-
C2PSA module with attention mechanism for enhanced feature extraction and processing.
|
|
1412
|
+
"""C2PSA module with attention mechanism for enhanced feature extraction and processing.
|
|
1444
1413
|
|
|
1445
1414
|
This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
|
|
1446
1415
|
capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.
|
|
@@ -1454,18 +1423,17 @@ class C2PSA(nn.Module):
|
|
|
1454
1423
|
Methods:
|
|
1455
1424
|
forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.
|
|
1456
1425
|
|
|
1457
|
-
Notes:
|
|
1458
|
-
This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
|
|
1459
|
-
|
|
1460
1426
|
Examples:
|
|
1461
1427
|
>>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
|
|
1462
1428
|
>>> input_tensor = torch.randn(1, 256, 64, 64)
|
|
1463
1429
|
>>> output_tensor = c2psa(input_tensor)
|
|
1430
|
+
|
|
1431
|
+
Notes:
|
|
1432
|
+
This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
|
|
1464
1433
|
"""
|
|
1465
1434
|
|
|
1466
|
-
def __init__(self, c1, c2, n=1, e=0.5):
|
|
1467
|
-
"""
|
|
1468
|
-
Initialize C2PSA module.
|
|
1435
|
+
def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
|
|
1436
|
+
"""Initialize C2PSA module.
|
|
1469
1437
|
|
|
1470
1438
|
Args:
|
|
1471
1439
|
c1 (int): Input channels.
|
|
@@ -1481,9 +1449,8 @@ class C2PSA(nn.Module):
|
|
|
1481
1449
|
|
|
1482
1450
|
self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
|
|
1483
1451
|
|
|
1484
|
-
def forward(self, x):
|
|
1485
|
-
"""
|
|
1486
|
-
Process the input tensor through a series of PSA blocks.
|
|
1452
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1453
|
+
"""Process the input tensor through a series of PSA blocks.
|
|
1487
1454
|
|
|
1488
1455
|
Args:
|
|
1489
1456
|
x (torch.Tensor): Input tensor.
|
|
@@ -1497,10 +1464,10 @@ class C2PSA(nn.Module):
|
|
|
1497
1464
|
|
|
1498
1465
|
|
|
1499
1466
|
class C2fPSA(C2f):
|
|
1500
|
-
"""
|
|
1501
|
-
C2fPSA module with enhanced feature extraction using PSA blocks.
|
|
1467
|
+
"""C2fPSA module with enhanced feature extraction using PSA blocks.
|
|
1502
1468
|
|
|
1503
|
-
This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature
|
|
1469
|
+
This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature
|
|
1470
|
+
extraction.
|
|
1504
1471
|
|
|
1505
1472
|
Attributes:
|
|
1506
1473
|
c (int): Number of hidden channels.
|
|
@@ -1521,9 +1488,8 @@ class C2fPSA(C2f):
|
|
|
1521
1488
|
>>> print(output.shape)
|
|
1522
1489
|
"""
|
|
1523
1490
|
|
|
1524
|
-
def __init__(self, c1, c2, n=1, e=0.5):
|
|
1525
|
-
"""
|
|
1526
|
-
Initialize C2fPSA module.
|
|
1491
|
+
def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
|
|
1492
|
+
"""Initialize C2fPSA module.
|
|
1527
1493
|
|
|
1528
1494
|
Args:
|
|
1529
1495
|
c1 (int): Input channels.
|
|
@@ -1537,8 +1503,7 @@ class C2fPSA(C2f):
|
|
|
1537
1503
|
|
|
1538
1504
|
|
|
1539
1505
|
class SCDown(nn.Module):
|
|
1540
|
-
"""
|
|
1541
|
-
SCDown module for downsampling with separable convolutions.
|
|
1506
|
+
"""SCDown module for downsampling with separable convolutions.
|
|
1542
1507
|
|
|
1543
1508
|
This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
|
|
1544
1509
|
efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.
|
|
@@ -1560,9 +1525,8 @@ class SCDown(nn.Module):
|
|
|
1560
1525
|
torch.Size([1, 128, 64, 64])
|
|
1561
1526
|
"""
|
|
1562
1527
|
|
|
1563
|
-
def __init__(self, c1, c2, k, s):
|
|
1564
|
-
"""
|
|
1565
|
-
Initialize SCDown module.
|
|
1528
|
+
def __init__(self, c1: int, c2: int, k: int, s: int):
|
|
1529
|
+
"""Initialize SCDown module.
|
|
1566
1530
|
|
|
1567
1531
|
Args:
|
|
1568
1532
|
c1 (int): Input channels.
|
|
@@ -1574,9 +1538,8 @@ class SCDown(nn.Module):
|
|
|
1574
1538
|
self.cv1 = Conv(c1, c2, 1, 1)
|
|
1575
1539
|
self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
|
|
1576
1540
|
|
|
1577
|
-
def forward(self, x):
|
|
1578
|
-
"""
|
|
1579
|
-
Apply convolution and downsampling to the input tensor.
|
|
1541
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1542
|
+
"""Apply convolution and downsampling to the input tensor.
|
|
1580
1543
|
|
|
1581
1544
|
Args:
|
|
1582
1545
|
x (torch.Tensor): Input tensor.
|
|
@@ -1588,25 +1551,26 @@ class SCDown(nn.Module):
|
|
|
1588
1551
|
|
|
1589
1552
|
|
|
1590
1553
|
class TorchVision(nn.Module):
|
|
1591
|
-
"""
|
|
1592
|
-
TorchVision module to allow loading any torchvision model.
|
|
1554
|
+
"""TorchVision module to allow loading any torchvision model.
|
|
1593
1555
|
|
|
1594
|
-
This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and
|
|
1595
|
-
|
|
1596
|
-
Attributes:
|
|
1597
|
-
m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.
|
|
1556
|
+
This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and
|
|
1557
|
+
customize the model by truncating or unwrapping layers.
|
|
1598
1558
|
|
|
1599
1559
|
Args:
|
|
1600
1560
|
model (str): Name of the torchvision model to load.
|
|
1601
1561
|
weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
|
|
1602
|
-
unwrap (bool, optional):
|
|
1562
|
+
unwrap (bool, optional): Unwraps the model to a sequential containing all but the last `truncate` layers.
|
|
1603
1563
|
truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
|
|
1604
1564
|
split (bool, optional): Returns output from intermediate child modules as list. Default is False.
|
|
1565
|
+
|
|
1566
|
+
Attributes:
|
|
1567
|
+
m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.
|
|
1605
1568
|
"""
|
|
1606
1569
|
|
|
1607
|
-
def __init__(
|
|
1608
|
-
""
|
|
1609
|
-
|
|
1570
|
+
def __init__(
|
|
1571
|
+
self, model: str, weights: str = "DEFAULT", unwrap: bool = True, truncate: int = 2, split: bool = False
|
|
1572
|
+
):
|
|
1573
|
+
"""Load the model and weights from torchvision.
|
|
1610
1574
|
|
|
1611
1575
|
Args:
|
|
1612
1576
|
model (str): Name of the torchvision model to load.
|
|
@@ -1632,15 +1596,14 @@ class TorchVision(nn.Module):
|
|
|
1632
1596
|
self.split = False
|
|
1633
1597
|
self.m.head = self.m.heads = nn.Identity()
|
|
1634
1598
|
|
|
1635
|
-
def forward(self, x):
|
|
1636
|
-
"""
|
|
1637
|
-
Forward pass through the model.
|
|
1599
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1600
|
+
"""Forward pass through the model.
|
|
1638
1601
|
|
|
1639
1602
|
Args:
|
|
1640
1603
|
x (torch.Tensor): Input tensor.
|
|
1641
1604
|
|
|
1642
1605
|
Returns:
|
|
1643
|
-
(torch.Tensor |
|
|
1606
|
+
(torch.Tensor | list[torch.Tensor]): Output tensor or list of tensors.
|
|
1644
1607
|
"""
|
|
1645
1608
|
if self.split:
|
|
1646
1609
|
y = [x]
|
|
@@ -1651,8 +1614,7 @@ class TorchVision(nn.Module):
|
|
|
1651
1614
|
|
|
1652
1615
|
|
|
1653
1616
|
class AAttn(nn.Module):
|
|
1654
|
-
"""
|
|
1655
|
-
Area-attention module for YOLO models, providing efficient attention mechanisms.
|
|
1617
|
+
"""Area-attention module for YOLO models, providing efficient attention mechanisms.
|
|
1656
1618
|
|
|
1657
1619
|
This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
|
|
1658
1620
|
making it particularly effective for object detection tasks.
|
|
@@ -1676,14 +1638,13 @@ class AAttn(nn.Module):
|
|
|
1676
1638
|
torch.Size([1, 256, 32, 32])
|
|
1677
1639
|
"""
|
|
1678
1640
|
|
|
1679
|
-
def __init__(self, dim, num_heads, area=1):
|
|
1680
|
-
"""
|
|
1681
|
-
Initialize an Area-attention module for YOLO models.
|
|
1641
|
+
def __init__(self, dim: int, num_heads: int, area: int = 1):
|
|
1642
|
+
"""Initialize an Area-attention module for YOLO models.
|
|
1682
1643
|
|
|
1683
1644
|
Args:
|
|
1684
1645
|
dim (int): Number of hidden channels.
|
|
1685
1646
|
num_heads (int): Number of heads into which the attention mechanism is divided.
|
|
1686
|
-
area (int): Number of areas the feature map is divided
|
|
1647
|
+
area (int): Number of areas the feature map is divided.
|
|
1687
1648
|
"""
|
|
1688
1649
|
super().__init__()
|
|
1689
1650
|
self.area = area
|
|
@@ -1696,9 +1657,8 @@ class AAttn(nn.Module):
|
|
|
1696
1657
|
self.proj = Conv(all_head_dim, dim, 1, act=False)
|
|
1697
1658
|
self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)
|
|
1698
1659
|
|
|
1699
|
-
def forward(self, x):
|
|
1700
|
-
"""
|
|
1701
|
-
Process the input tensor through the area-attention.
|
|
1660
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1661
|
+
"""Process the input tensor through the area-attention.
|
|
1702
1662
|
|
|
1703
1663
|
Args:
|
|
1704
1664
|
x (torch.Tensor): Input tensor.
|
|
@@ -1737,8 +1697,7 @@ class AAttn(nn.Module):
|
|
|
1737
1697
|
|
|
1738
1698
|
|
|
1739
1699
|
class ABlock(nn.Module):
|
|
1740
|
-
"""
|
|
1741
|
-
Area-attention block module for efficient feature extraction in YOLO models.
|
|
1700
|
+
"""Area-attention block module for efficient feature extraction in YOLO models.
|
|
1742
1701
|
|
|
1743
1702
|
This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
|
|
1744
1703
|
It uses a novel area-based attention approach that is more efficient than traditional self-attention while
|
|
@@ -1760,9 +1719,8 @@ class ABlock(nn.Module):
|
|
|
1760
1719
|
torch.Size([1, 256, 32, 32])
|
|
1761
1720
|
"""
|
|
1762
1721
|
|
|
1763
|
-
def __init__(self, dim, num_heads, mlp_ratio=1.2, area=1):
|
|
1764
|
-
"""
|
|
1765
|
-
Initialize an Area-attention block module.
|
|
1722
|
+
def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 1.2, area: int = 1):
|
|
1723
|
+
"""Initialize an Area-attention block module.
|
|
1766
1724
|
|
|
1767
1725
|
Args:
|
|
1768
1726
|
dim (int): Number of input channels.
|
|
@@ -1778,9 +1736,8 @@ class ABlock(nn.Module):
|
|
|
1778
1736
|
|
|
1779
1737
|
self.apply(self._init_weights)
|
|
1780
1738
|
|
|
1781
|
-
def _init_weights(self, m):
|
|
1782
|
-
"""
|
|
1783
|
-
Initialize weights using a truncated normal distribution.
|
|
1739
|
+
def _init_weights(self, m: nn.Module):
|
|
1740
|
+
"""Initialize weights using a truncated normal distribution.
|
|
1784
1741
|
|
|
1785
1742
|
Args:
|
|
1786
1743
|
m (nn.Module): Module to initialize.
|
|
@@ -1790,9 +1747,8 @@ class ABlock(nn.Module):
|
|
|
1790
1747
|
if m.bias is not None:
|
|
1791
1748
|
nn.init.constant_(m.bias, 0)
|
|
1792
1749
|
|
|
1793
|
-
def forward(self, x):
|
|
1794
|
-
"""
|
|
1795
|
-
Forward pass through ABlock.
|
|
1750
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1751
|
+
"""Forward pass through ABlock.
|
|
1796
1752
|
|
|
1797
1753
|
Args:
|
|
1798
1754
|
x (torch.Tensor): Input tensor.
|
|
@@ -1805,8 +1761,7 @@ class ABlock(nn.Module):
|
|
|
1805
1761
|
|
|
1806
1762
|
|
|
1807
1763
|
class A2C2f(nn.Module):
|
|
1808
|
-
"""
|
|
1809
|
-
Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
|
|
1764
|
+
"""Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
|
|
1810
1765
|
|
|
1811
1766
|
This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
|
|
1812
1767
|
processing. It supports both area-attention and standard convolution modes.
|
|
@@ -1828,9 +1783,20 @@ class A2C2f(nn.Module):
|
|
|
1828
1783
|
torch.Size([1, 512, 32, 32])
|
|
1829
1784
|
"""
|
|
1830
1785
|
|
|
1831
|
-
def __init__(
|
|
1832
|
-
|
|
1833
|
-
|
|
1786
|
+
def __init__(
|
|
1787
|
+
self,
|
|
1788
|
+
c1: int,
|
|
1789
|
+
c2: int,
|
|
1790
|
+
n: int = 1,
|
|
1791
|
+
a2: bool = True,
|
|
1792
|
+
area: int = 1,
|
|
1793
|
+
residual: bool = False,
|
|
1794
|
+
mlp_ratio: float = 2.0,
|
|
1795
|
+
e: float = 0.5,
|
|
1796
|
+
g: int = 1,
|
|
1797
|
+
shortcut: bool = True,
|
|
1798
|
+
):
|
|
1799
|
+
"""Initialize Area-Attention C2f module.
|
|
1834
1800
|
|
|
1835
1801
|
Args:
|
|
1836
1802
|
c1 (int): Number of input channels.
|
|
@@ -1859,9 +1825,8 @@ class A2C2f(nn.Module):
|
|
|
1859
1825
|
for _ in range(n)
|
|
1860
1826
|
)
|
|
1861
1827
|
|
|
1862
|
-
def forward(self, x):
|
|
1863
|
-
"""
|
|
1864
|
-
Forward pass through A2C2f layer.
|
|
1828
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1829
|
+
"""Forward pass through A2C2f layer.
|
|
1865
1830
|
|
|
1866
1831
|
Args:
|
|
1867
1832
|
x (torch.Tensor): Input tensor.
|
|
@@ -1873,20 +1838,26 @@ class A2C2f(nn.Module):
|
|
|
1873
1838
|
y.extend(m(y[-1]) for m in self.m)
|
|
1874
1839
|
y = self.cv2(torch.cat(y, 1))
|
|
1875
1840
|
if self.gamma is not None:
|
|
1876
|
-
return x + self.gamma.view(-1,
|
|
1841
|
+
return x + self.gamma.view(-1, self.gamma.shape[0], 1, 1) * y
|
|
1877
1842
|
return y
|
|
1878
1843
|
|
|
1879
1844
|
|
|
1880
1845
|
class SwiGLUFFN(nn.Module):
|
|
1881
1846
|
"""SwiGLU Feed-Forward Network for transformer-based architectures."""
|
|
1882
1847
|
|
|
1883
|
-
def __init__(self, gc, ec, e=4) -> None:
|
|
1884
|
-
"""Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.
|
|
1848
|
+
def __init__(self, gc: int, ec: int, e: int = 4) -> None:
|
|
1849
|
+
"""Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.
|
|
1850
|
+
|
|
1851
|
+
Args:
|
|
1852
|
+
gc (int): Guide channels.
|
|
1853
|
+
ec (int): Embedding channels.
|
|
1854
|
+
e (int): Expansion factor.
|
|
1855
|
+
"""
|
|
1885
1856
|
super().__init__()
|
|
1886
1857
|
self.w12 = nn.Linear(gc, e * ec)
|
|
1887
1858
|
self.w3 = nn.Linear(e * ec // 2, ec)
|
|
1888
1859
|
|
|
1889
|
-
def forward(self, x):
|
|
1860
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1890
1861
|
"""Apply SwiGLU transformation to input features."""
|
|
1891
1862
|
x12 = self.w12(x)
|
|
1892
1863
|
x1, x2 = x12.chunk(2, dim=-1)
|
|
@@ -1897,8 +1868,12 @@ class SwiGLUFFN(nn.Module):
|
|
|
1897
1868
|
class Residual(nn.Module):
|
|
1898
1869
|
"""Residual connection wrapper for neural network modules."""
|
|
1899
1870
|
|
|
1900
|
-
def __init__(self, m) -> None:
|
|
1901
|
-
"""Initialize residual module with the wrapped module.
|
|
1871
|
+
def __init__(self, m: nn.Module) -> None:
|
|
1872
|
+
"""Initialize residual module with the wrapped module.
|
|
1873
|
+
|
|
1874
|
+
Args:
|
|
1875
|
+
m (nn.Module): Module to wrap with residual connection.
|
|
1876
|
+
"""
|
|
1902
1877
|
super().__init__()
|
|
1903
1878
|
self.m = m
|
|
1904
1879
|
nn.init.zeros_(self.m.w3.bias)
|
|
@@ -1906,7 +1881,7 @@ class Residual(nn.Module):
|
|
|
1906
1881
|
# nn.init.constant_(self.m.w3.weight, 1e-6)
|
|
1907
1882
|
nn.init.zeros_(self.m.w3.weight)
|
|
1908
1883
|
|
|
1909
|
-
def forward(self, x):
|
|
1884
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
1910
1885
|
"""Apply residual connection to input features."""
|
|
1911
1886
|
return x + self.m(x)
|
|
1912
1887
|
|
|
@@ -1914,8 +1889,14 @@ class Residual(nn.Module):
|
|
|
1914
1889
|
class SAVPE(nn.Module):
|
|
1915
1890
|
"""Spatial-Aware Visual Prompt Embedding module for feature enhancement."""
|
|
1916
1891
|
|
|
1917
|
-
def __init__(self, ch, c3, embed):
|
|
1918
|
-
"""Initialize SAVPE module with channels, intermediate channels, and embedding dimension.
|
|
1892
|
+
def __init__(self, ch: list[int], c3: int, embed: int):
|
|
1893
|
+
"""Initialize SAVPE module with channels, intermediate channels, and embedding dimension.
|
|
1894
|
+
|
|
1895
|
+
Args:
|
|
1896
|
+
ch (list[int]): List of input channel dimensions.
|
|
1897
|
+
c3 (int): Intermediate channels.
|
|
1898
|
+
embed (int): Embedding dimension.
|
|
1899
|
+
"""
|
|
1919
1900
|
super().__init__()
|
|
1920
1901
|
self.cv1 = nn.ModuleList(
|
|
1921
1902
|
nn.Sequential(
|
|
@@ -1935,7 +1916,7 @@ class SAVPE(nn.Module):
|
|
|
1935
1916
|
self.cv5 = nn.Conv2d(1, self.c, 3, padding=1)
|
|
1936
1917
|
self.cv6 = nn.Sequential(Conv(2 * self.c, self.c, 3), nn.Conv2d(self.c, self.c, 3, padding=1))
|
|
1937
1918
|
|
|
1938
|
-
def forward(self, x, vp):
|
|
1919
|
+
def forward(self, x: list[torch.Tensor], vp: torch.Tensor) -> torch.Tensor:
|
|
1939
1920
|
"""Process input features and visual prompts to generate enhanced embeddings."""
|
|
1940
1921
|
y = [self.cv2[i](xi) for i, xi in enumerate(x)]
|
|
1941
1922
|
y = self.cv4(torch.cat(y, dim=1))
|
|
@@ -1958,9 +1939,7 @@ class SAVPE(nn.Module):
|
|
|
1958
1939
|
vp = vp.reshape(B, Q, 1, -1)
|
|
1959
1940
|
|
|
1960
1941
|
score = y * vp + torch.logical_not(vp) * torch.finfo(y.dtype).min
|
|
1961
|
-
|
|
1962
|
-
score = F.softmax(score, dim=-1, dtype=torch.float).to(score.dtype)
|
|
1963
|
-
|
|
1942
|
+
score = F.softmax(score, dim=-1).to(y.dtype)
|
|
1964
1943
|
aggregated = score.transpose(-2, -3) @ x.reshape(B, self.c, C // self.c, -1).transpose(-1, -2)
|
|
1965
1944
|
|
|
1966
1945
|
return F.normalize(aggregated.transpose(-2, -3).reshape(B, Q, -1), dim=-1, p=2)
|