dgenerate-ultralytics-headless 8.3.214-py3-none-any.whl → 8.3.248-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/METADATA +13 -14
- dgenerate_ultralytics_headless-8.3.248.dist-info/RECORD +298 -0
- tests/__init__.py +5 -7
- tests/conftest.py +8 -15
- tests/test_cli.py +1 -1
- tests/test_cuda.py +5 -8
- tests/test_engine.py +1 -1
- tests/test_exports.py +57 -12
- tests/test_integrations.py +4 -4
- tests/test_python.py +84 -53
- tests/test_solutions.py +160 -151
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +56 -62
- ultralytics/cfg/datasets/Argoverse.yaml +7 -6
- ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
- ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
- ultralytics/cfg/datasets/ImageNet.yaml +1 -1
- ultralytics/cfg/datasets/VOC.yaml +15 -16
- ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
- ultralytics/cfg/datasets/coco-pose.yaml +21 -0
- ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
- ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
- ultralytics/cfg/datasets/dog-pose.yaml +28 -0
- ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
- ultralytics/cfg/datasets/kitti.yaml +27 -0
- ultralytics/cfg/datasets/lvis.yaml +5 -5
- ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
- ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
- ultralytics/cfg/datasets/xView.yaml +16 -16
- ultralytics/cfg/default.yaml +1 -1
- ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
- ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
- ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
- ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
- ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
- ultralytics/cfg/models/v6/yolov6.yaml +1 -1
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
- ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
- ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +3 -4
- ultralytics/data/augment.py +285 -475
- ultralytics/data/base.py +18 -26
- ultralytics/data/build.py +147 -25
- ultralytics/data/converter.py +36 -46
- ultralytics/data/dataset.py +46 -74
- ultralytics/data/loaders.py +42 -49
- ultralytics/data/split.py +5 -6
- ultralytics/data/split_dota.py +8 -15
- ultralytics/data/utils.py +34 -43
- ultralytics/engine/exporter.py +319 -237
- ultralytics/engine/model.py +148 -188
- ultralytics/engine/predictor.py +29 -38
- ultralytics/engine/results.py +177 -311
- ultralytics/engine/trainer.py +83 -59
- ultralytics/engine/tuner.py +23 -34
- ultralytics/engine/validator.py +39 -22
- ultralytics/hub/__init__.py +16 -19
- ultralytics/hub/auth.py +6 -12
- ultralytics/hub/google/__init__.py +7 -10
- ultralytics/hub/session.py +15 -25
- ultralytics/hub/utils.py +5 -8
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +8 -10
- ultralytics/models/fastsam/predict.py +17 -29
- ultralytics/models/fastsam/utils.py +1 -2
- ultralytics/models/fastsam/val.py +5 -7
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +5 -8
- ultralytics/models/nas/predict.py +7 -9
- ultralytics/models/nas/val.py +1 -2
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +5 -8
- ultralytics/models/rtdetr/predict.py +15 -19
- ultralytics/models/rtdetr/train.py +10 -13
- ultralytics/models/rtdetr/val.py +21 -23
- ultralytics/models/sam/__init__.py +15 -2
- ultralytics/models/sam/amg.py +14 -20
- ultralytics/models/sam/build.py +26 -19
- ultralytics/models/sam/build_sam3.py +377 -0
- ultralytics/models/sam/model.py +29 -32
- ultralytics/models/sam/modules/blocks.py +83 -144
- ultralytics/models/sam/modules/decoders.py +19 -37
- ultralytics/models/sam/modules/encoders.py +44 -101
- ultralytics/models/sam/modules/memory_attention.py +16 -30
- ultralytics/models/sam/modules/sam.py +200 -73
- ultralytics/models/sam/modules/tiny_encoder.py +64 -83
- ultralytics/models/sam/modules/transformer.py +18 -28
- ultralytics/models/sam/modules/utils.py +174 -50
- ultralytics/models/sam/predict.py +2248 -350
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +529 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +199 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +339 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/vitdet.py +547 -0
- ultralytics/models/sam/sam3/vl_combiner.py +160 -0
- ultralytics/models/utils/loss.py +14 -26
- ultralytics/models/utils/ops.py +13 -17
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +9 -12
- ultralytics/models/yolo/classify/train.py +11 -32
- ultralytics/models/yolo/classify/val.py +29 -28
- ultralytics/models/yolo/detect/predict.py +7 -10
- ultralytics/models/yolo/detect/train.py +11 -20
- ultralytics/models/yolo/detect/val.py +70 -58
- ultralytics/models/yolo/model.py +36 -53
- ultralytics/models/yolo/obb/predict.py +5 -14
- ultralytics/models/yolo/obb/train.py +11 -14
- ultralytics/models/yolo/obb/val.py +39 -36
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +6 -21
- ultralytics/models/yolo/pose/train.py +10 -15
- ultralytics/models/yolo/pose/val.py +38 -57
- ultralytics/models/yolo/segment/predict.py +14 -18
- ultralytics/models/yolo/segment/train.py +3 -6
- ultralytics/models/yolo/segment/val.py +93 -45
- ultralytics/models/yolo/world/train.py +8 -14
- ultralytics/models/yolo/world/train_world.py +11 -34
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +16 -23
- ultralytics/models/yolo/yoloe/train.py +30 -43
- ultralytics/models/yolo/yoloe/train_seg.py +5 -10
- ultralytics/models/yolo/yoloe/val.py +15 -20
- ultralytics/nn/__init__.py +7 -7
- ultralytics/nn/autobackend.py +145 -77
- ultralytics/nn/modules/__init__.py +60 -60
- ultralytics/nn/modules/activation.py +4 -6
- ultralytics/nn/modules/block.py +132 -216
- ultralytics/nn/modules/conv.py +52 -97
- ultralytics/nn/modules/head.py +50 -103
- ultralytics/nn/modules/transformer.py +76 -88
- ultralytics/nn/modules/utils.py +16 -21
- ultralytics/nn/tasks.py +94 -154
- ultralytics/nn/text_model.py +40 -67
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +11 -17
- ultralytics/solutions/analytics.py +15 -16
- ultralytics/solutions/config.py +5 -6
- ultralytics/solutions/distance_calculation.py +10 -13
- ultralytics/solutions/heatmap.py +7 -13
- ultralytics/solutions/instance_segmentation.py +5 -8
- ultralytics/solutions/object_blurrer.py +7 -10
- ultralytics/solutions/object_counter.py +12 -19
- ultralytics/solutions/object_cropper.py +8 -14
- ultralytics/solutions/parking_management.py +33 -31
- ultralytics/solutions/queue_management.py +10 -12
- ultralytics/solutions/region_counter.py +9 -12
- ultralytics/solutions/security_alarm.py +15 -20
- ultralytics/solutions/similarity_search.py +10 -15
- ultralytics/solutions/solutions.py +75 -74
- ultralytics/solutions/speed_estimation.py +7 -10
- ultralytics/solutions/streamlit_inference.py +2 -4
- ultralytics/solutions/templates/similarity-search.html +7 -18
- ultralytics/solutions/trackzone.py +7 -10
- ultralytics/solutions/vision_eye.py +5 -8
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +3 -5
- ultralytics/trackers/bot_sort.py +10 -27
- ultralytics/trackers/byte_tracker.py +14 -30
- ultralytics/trackers/track.py +3 -6
- ultralytics/trackers/utils/gmc.py +11 -22
- ultralytics/trackers/utils/kalman_filter.py +37 -48
- ultralytics/trackers/utils/matching.py +12 -15
- ultralytics/utils/__init__.py +116 -116
- ultralytics/utils/autobatch.py +2 -4
- ultralytics/utils/autodevice.py +17 -18
- ultralytics/utils/benchmarks.py +32 -46
- ultralytics/utils/callbacks/base.py +8 -10
- ultralytics/utils/callbacks/clearml.py +5 -13
- ultralytics/utils/callbacks/comet.py +32 -46
- ultralytics/utils/callbacks/dvc.py +13 -18
- ultralytics/utils/callbacks/mlflow.py +4 -5
- ultralytics/utils/callbacks/neptune.py +7 -15
- ultralytics/utils/callbacks/platform.py +314 -38
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +23 -31
- ultralytics/utils/callbacks/wb.py +10 -13
- ultralytics/utils/checks.py +99 -76
- ultralytics/utils/cpu.py +3 -8
- ultralytics/utils/dist.py +8 -12
- ultralytics/utils/downloads.py +20 -30
- ultralytics/utils/errors.py +6 -14
- ultralytics/utils/events.py +2 -4
- ultralytics/utils/export/__init__.py +4 -236
- ultralytics/utils/export/engine.py +237 -0
- ultralytics/utils/export/imx.py +91 -55
- ultralytics/utils/export/tensorflow.py +231 -0
- ultralytics/utils/files.py +24 -28
- ultralytics/utils/git.py +9 -11
- ultralytics/utils/instance.py +30 -51
- ultralytics/utils/logger.py +212 -114
- ultralytics/utils/loss.py +14 -22
- ultralytics/utils/metrics.py +126 -155
- ultralytics/utils/nms.py +13 -16
- ultralytics/utils/ops.py +107 -165
- ultralytics/utils/patches.py +33 -21
- ultralytics/utils/plotting.py +72 -80
- ultralytics/utils/tal.py +25 -39
- ultralytics/utils/torch_utils.py +52 -78
- ultralytics/utils/tqdm.py +20 -20
- ultralytics/utils/triton.py +13 -19
- ultralytics/utils/tuner.py +17 -5
- dgenerate_ultralytics_headless-8.3.214.dist-info/RECORD +0 -283
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/top_level.txt +0 -0

--- a/ultralytics/models/sam/modules/tiny_encoder.py
+++ b/ultralytics/models/sam/modules/tiny_encoder.py
@@ -22,12 +22,11 @@ from ultralytics.utils.instance import to_2tuple
 
 
 class Conv2d_BN(torch.nn.Sequential):
-    """
-    A sequential container that performs 2D convolution followed by batch normalization.
+    """A sequential container that performs 2D convolution followed by batch normalization.
 
-    This module combines a 2D convolution layer with batch normalization, providing a common building block
-    for convolutional neural networks. The batch normalization weights and biases are initialized to specific
-    values for optimal training performance.
+    This module combines a 2D convolution layer with batch normalization, providing a common building block for
+    convolutional neural networks. The batch normalization weights and biases are initialized to specific values for
+    optimal training performance.
 
     Attributes:
         c (torch.nn.Conv2d): 2D convolution layer.
@@ -52,8 +51,7 @@ class Conv2d_BN(torch.nn.Sequential):
         groups: int = 1,
         bn_weight_init: float = 1,
     ):
-        """
-        Initialize a sequential container with 2D convolution followed by batch normalization.
+        """Initialize a sequential container with 2D convolution followed by batch normalization.
 
         Args:
             a (int): Number of input channels.
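
As a point of reference for the pattern described above: a bias-free convolution whose affine shift comes from the following batch-norm layer, with the BN scale initialized from a `bn_weight_init` argument. This is a minimal stand-in written for illustration, not the actual Ultralytics `Conv2d_BN` code; the class name, argument names, and defaults are assumptions.

```python
import torch
from torch import nn


class Conv2dBNSketch(nn.Sequential):
    """Minimal sketch: 2D convolution (bias-free) followed by batch normalization."""

    def __init__(self, in_ch, out_ch, ks=1, stride=1, pad=0, bn_weight_init=1.0):
        super().__init__()
        # Convolution without bias; the following BatchNorm supplies the affine shift.
        self.add_module("c", nn.Conv2d(in_ch, out_ch, ks, stride, pad, bias=False))
        bn = nn.BatchNorm2d(out_ch)
        # Initialize the BN scale and shift to fixed values, as the docstring describes.
        nn.init.constant_(bn.weight, bn_weight_init)
        nn.init.constant_(bn.bias, 0.0)
        self.add_module("bn", bn)


x = torch.randn(1, 16, 32, 32)
print(Conv2dBNSketch(16, 32, ks=3, pad=1)(x).shape)  # torch.Size([1, 32, 32, 32])
```
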
@@ -74,11 +72,10 @@ class Conv2d_BN(torch.nn.Sequential):
 
 
 class PatchEmbed(nn.Module):
-    """
-    Embed images into patches and project them into a specified embedding dimension.
+    """Embed images into patches and project them into a specified embedding dimension.
 
-    This module converts input images into patch embeddings using a sequence of convolutional layers,
-    effectively downsampling the spatial dimensions while increasing the channel dimension.
+    This module converts input images into patch embeddings using a sequence of convolutional layers, effectively
+    downsampling the spatial dimensions while increasing the channel dimension.
 
     Attributes:
         patches_resolution (tuple[int, int]): Resolution of the patches after embedding.
@@ -97,8 +94,7 @@ class PatchEmbed(nn.Module):
     """
 
     def __init__(self, in_chans: int, embed_dim: int, resolution: int, activation):
-        """
-        Initialize patch embedding with convolutional layers for image-to-patch conversion and projection.
+        """Initialize patch embedding with convolutional layers for image-to-patch conversion and projection.
 
         Args:
             in_chans (int): Number of input channels.
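
The patch-embedding behaviour described here can be sketched as two stride-2 convolutions that reduce H and W by 4x while projecting the channels to `embed_dim`. The layer sizes, activation, and intermediate width below are illustrative assumptions, not the exact Ultralytics implementation.

```python
import torch
from torch import nn


class PatchEmbedSketch(nn.Module):
    """Sketch: embed an image into patches with two stride-2 convolution stages."""

    def __init__(self, in_chans: int, embed_dim: int):
        super().__init__()
        mid = embed_dim // 2
        self.seq = nn.Sequential(
            nn.Conv2d(in_chans, mid, 3, stride=2, padding=1),   # H/2, W/2
            nn.GELU(),
            nn.Conv2d(mid, embed_dim, 3, stride=2, padding=1),  # H/4, W/4
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.seq(x)


out = PatchEmbedSketch(3, 96)(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 96, 56, 56]): spatial /4, channels -> embed_dim
```
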
@@ -125,11 +121,10 @@ class PatchEmbed(nn.Module):
 
 
 class MBConv(nn.Module):
-    """
-    Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.
+    """Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.
 
-    This module implements the mobile inverted bottleneck convolution with expansion, depthwise convolution,
-    and projection phases, along with residual connections for improved gradient flow.
+    This module implements the mobile inverted bottleneck convolution with expansion, depthwise convolution, and
+    projection phases, along with residual connections for improved gradient flow.
 
     Attributes:
         in_chans (int): Number of input channels.
@@ -153,8 +148,7 @@ class MBConv(nn.Module):
     """
 
     def __init__(self, in_chans: int, out_chans: int, expand_ratio: float, activation, drop_path: float):
-        """
-        Initialize the MBConv layer with specified input/output channels, expansion ratio, and activation.
+        """Initialize the MBConv layer with specified input/output channels, expansion ratio, and activation.
 
         Args:
             in_chans (int): Number of input channels.
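
A rough sketch of the MBConv recipe the docstring names: 1x1 expansion, 3x3 depthwise convolution, 1x1 projection, and a residual add. The channel counts, activation, and the omitted drop-path are simplifications; names are illustrative.

```python
import torch
from torch import nn


class MBConvSketch(nn.Module):
    """Sketch: expand (1x1) -> depthwise (3x3) -> project (1x1), plus a residual add."""

    def __init__(self, chans: int, expand_ratio: float = 4.0):
        super().__init__()
        hidden = int(chans * expand_ratio)
        self.expand = nn.Sequential(nn.Conv2d(chans, hidden, 1, bias=False), nn.BatchNorm2d(hidden), nn.GELU())
        self.depthwise = nn.Sequential(
            nn.Conv2d(hidden, hidden, 3, padding=1, groups=hidden, bias=False), nn.BatchNorm2d(hidden), nn.GELU()
        )
        self.project = nn.Sequential(nn.Conv2d(hidden, chans, 1, bias=False), nn.BatchNorm2d(chans))
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The residual connection keeps gradient flow healthy, as the docstring notes.
        return self.act(x + self.project(self.depthwise(self.expand(x))))


print(MBConvSketch(32)(torch.randn(1, 32, 56, 56)).shape)  # shape preserved: (1, 32, 56, 56)
```
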
@@ -195,12 +189,11 @@ class MBConv(nn.Module):
 
 
 class PatchMerging(nn.Module):
-    """
-    Merge neighboring patches in the feature map and project to a new dimension.
+    """Merge neighboring patches in the feature map and project to a new dimension.
 
-    This class implements a patch merging operation that combines spatial information and adjusts the feature
-    dimension using a series of convolutional layers with batch normalization. It effectively reduces spatial
-    resolution while potentially increasing channel dimensions.
+    This class implements a patch merging operation that combines spatial information and adjusts the feature dimension
+    using a series of convolutional layers with batch normalization. It effectively reduces spatial resolution while
+    potentially increasing channel dimensions.
 
     Attributes:
         input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
@@ -221,8 +214,7 @@ class PatchMerging(nn.Module):
     """
 
     def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
-        """
-        Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.
+        """Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.
 
         Args:
             input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
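
One plausible rendering of the described operation: three convolution-plus-batch-norm stages where the middle depthwise convolution uses stride 2, halving the spatial resolution while projecting `dim` to `out_dim`. Kernel sizes and strides in the real module may differ; this is an illustrative sketch only.

```python
import torch
from torch import nn


class PatchMergingSketch(nn.Module):
    """Sketch: merge neighboring patches via a strided depthwise conv, projecting dim -> out_dim."""

    def __init__(self, dim: int, out_dim: int):
        super().__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(dim, out_dim, 1, bias=False), nn.BatchNorm2d(out_dim))
        self.conv2 = nn.Sequential(  # stride-2 depthwise conv halves the spatial resolution
            nn.Conv2d(out_dim, out_dim, 3, stride=2, padding=1, groups=out_dim, bias=False), nn.BatchNorm2d(out_dim)
        )
        self.conv3 = nn.Sequential(nn.Conv2d(out_dim, out_dim, 1, bias=False), nn.BatchNorm2d(out_dim))
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.conv3(self.act(self.conv2(self.act(self.conv1(x)))))


print(PatchMergingSketch(96, 192)(torch.randn(1, 96, 56, 56)).shape)  # (1, 192, 28, 28)
```
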
@@ -259,11 +251,10 @@ class PatchMerging(nn.Module):
 
 
 class ConvLayer(nn.Module):
-    """
-    Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
+    """Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
 
-    This layer optionally applies downsample operations to the output and supports gradient checkpointing
-    for memory efficiency during training.
+    This layer optionally applies downsample operations to the output and supports gradient checkpointing for memory
+    efficiency during training.
 
     Attributes:
         dim (int): Dimensionality of the input and output.
@@ -293,11 +284,10 @@ class ConvLayer(nn.Module):
         out_dim: int | None = None,
         conv_expand_ratio: float = 4.0,
     ):
-        """
-        Initialize the ConvLayer with the given dimensions and settings.
+        """Initialize the ConvLayer with the given dimensions and settings.
 
-        This layer consists of multiple MobileNetV3-style inverted bottleneck convolutions (MBConv) and
-        optionally applies downsampling to the output.
+        This layer consists of multiple MobileNetV3-style inverted bottleneck convolutions (MBConv) and optionally
+        applies downsampling to the output.
 
         Args:
             dim (int): The dimensionality of the input and output.
@@ -307,7 +297,7 @@ class ConvLayer(nn.Module):
             drop_path (float | list[float], optional): Drop path rate. Single float or a list of floats for each MBConv.
             downsample (Optional[nn.Module], optional): Function for downsampling the output. None to skip downsampling.
             use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
-            out_dim (Optional[int], optional):
+            out_dim (Optional[int], optional): Output dimensions. None means it will be the same as `dim`.
             conv_expand_ratio (float, optional): Expansion ratio for the MBConv layers.
         """
         super().__init__()
@@ -345,11 +335,10 @@ class ConvLayer(nn.Module):
 
 
 class MLP(nn.Module):
-    """
-    Multi-layer Perceptron (MLP) module for transformer architectures.
+    """Multi-layer Perceptron (MLP) module for transformer architectures.
 
-    This module applies layer normalization, two fully-connected layers with an activation function in between,
-    and dropout. It is commonly used in transformer-based architectures for processing token embeddings.
+    This module applies layer normalization, two fully-connected layers with an activation function in between, and
+    dropout. It is commonly used in transformer-based architectures for processing token embeddings.
 
     Attributes:
         norm (nn.LayerNorm): Layer normalization applied to the input.
@@ -376,8 +365,7 @@ class MLP(nn.Module):
         activation=nn.GELU,
         drop: float = 0.0,
     ):
-        """
-        Initialize a multi-layer perceptron with configurable input, hidden, and output dimensions.
+        """Initialize a multi-layer perceptron with configurable input, hidden, and output dimensions.
 
         Args:
             in_features (int): Number of input features.
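
The MLP described is straightforward to sketch: layer normalization, two linear layers with an activation in between, and dropout applied to token embeddings. Names and defaults below are assumptions for illustration.

```python
import torch
from torch import nn


class MLPSketch(nn.Module):
    """Sketch: LayerNorm, two fully-connected layers with an activation in between, and dropout."""

    def __init__(self, in_features: int, hidden_features: int, drop: float = 0.0):
        super().__init__()
        self.norm = nn.LayerNorm(in_features)
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.norm(x)
        x = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(x))


tokens = torch.randn(2, 196, 192)         # (batch, tokens, channels)
print(MLPSketch(192, 768)(tokens).shape)  # torch.Size([2, 196, 192])
```
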
@@ -406,12 +394,11 @@ class MLP(nn.Module):
 
 
 class Attention(torch.nn.Module):
-    """
-    Multi-head attention module with spatial awareness and trainable attention biases.
+    """Multi-head attention module with spatial awareness and trainable attention biases.
 
-    This module implements a multi-head attention mechanism with support for spatial awareness, applying
-    attention biases based on spatial resolution. It includes trainable attention biases for each unique offset
-    between spatial positions in the resolution grid.
+    This module implements a multi-head attention mechanism with support for spatial awareness, applying attention
+    biases based on spatial resolution. It includes trainable attention biases for each unique offset between spatial
+    positions in the resolution grid.
 
     Attributes:
         num_heads (int): Number of attention heads.
@@ -444,12 +431,11 @@ class Attention(torch.nn.Module):
         attn_ratio: float = 4,
         resolution: tuple[int, int] = (14, 14),
     ):
-        """
-        Initialize the Attention module for multi-head attention with spatial awareness.
+        """Initialize the Attention module for multi-head attention with spatial awareness.
 
-        This module implements a multi-head attention mechanism with support for spatial awareness, applying
-        attention biases based on spatial resolution. It includes trainable attention biases for each unique
-        offset between spatial positions in the resolution grid.
+        This module implements a multi-head attention mechanism with support for spatial awareness, applying attention
+        biases based on spatial resolution. It includes trainable attention biases for each unique offset between
+        spatial positions in the resolution grid.
 
         Args:
             dim (int): The dimensionality of the input and output.
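
A simplified, hedged sketch of attention with trainable spatial biases: one learnable bias per head for each unique offset between positions of a fixed HxW grid, added to the attention logits before softmax. The real module also separates key and value widths via `attn_ratio`; that detail is omitted here, and all names are illustrative.

```python
import itertools

import torch
from torch import nn


class BiasedAttentionSketch(nn.Module):
    """Sketch: multi-head attention over a fixed HxW token grid with trainable per-offset biases."""

    def __init__(self, dim: int, num_heads: int = 4, resolution: tuple[int, int] = (14, 14)):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

        # One trainable bias per head and per unique (|dy|, |dx|) offset between grid positions.
        points = list(itertools.product(range(resolution[0]), range(resolution[1])))
        offsets, idxs = {}, []
        for p1 in points:
            for p2 in points:
                off = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                idxs.append(offsets.setdefault(off, len(offsets)))
        self.attention_biases = nn.Parameter(torch.zeros(num_heads, len(offsets)))
        self.register_buffer("bias_idxs", torch.tensor(idxs).view(len(points), len(points)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (B, N, dim), N must equal H*W
        B, N, _ = x.shape
        q, k, v = self.qkv(x).view(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        attn = (q @ k.transpose(-2, -1)) * self.scale + self.attention_biases[:, self.bias_idxs]
        x = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(B, N, -1)
        return self.proj(x)


out = BiasedAttentionSketch(128, resolution=(14, 14))(torch.randn(1, 196, 128))
print(out.shape)  # torch.Size([1, 196, 128])
```
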
@@ -521,12 +507,11 @@ class Attention(torch.nn.Module):
 
 
 class TinyViTBlock(nn.Module):
-    """
-    TinyViT Block that applies self-attention and a local convolution to the input.
+    """TinyViT Block that applies self-attention and a local convolution to the input.
 
-    This block is a key component of the TinyViT architecture, combining self-attention mechanisms with
-    local convolutions to process input features efficiently. It supports windowed attention for computational
-    efficiency and includes residual connections.
+    This block is a key component of the TinyViT architecture, combining self-attention mechanisms with local
+    convolutions to process input features efficiently. It supports windowed attention for computational efficiency and
+    includes residual connections.
 
     Attributes:
         dim (int): The dimensionality of the input and output.
@@ -559,11 +544,10 @@ class TinyViTBlock(nn.Module):
         local_conv_size: int = 3,
         activation=nn.GELU,
     ):
-        """
-        Initialize a TinyViT block with self-attention and local convolution.
+        """Initialize a TinyViT block with self-attention and local convolution.
 
-        This block is a key component of the TinyViT architecture, combining self-attention mechanisms with
-        local convolutions to process input features efficiently.
+        This block is a key component of the TinyViT architecture, combining self-attention mechanisms with local
+        convolutions to process input features efficiently.
 
         Args:
             dim (int): Dimensionality of the input and output features.
@@ -644,8 +628,7 @@ class TinyViTBlock(nn.Module):
         return x + self.drop_path(self.mlp(x))
 
     def extra_repr(self) -> str:
-        """
-        Return a string representation of the TinyViTBlock's parameters.
+        """Return a string representation of the TinyViTBlock's parameters.
 
         This method provides a formatted string containing key information about the TinyViTBlock, including its
         dimension, input resolution, number of attention heads, window size, and MLP ratio.
@@ -665,12 +648,11 @@ class TinyViTBlock(nn.Module):
 
 
 class BasicLayer(nn.Module):
-    """
-    A basic TinyViT layer for one stage in a TinyViT architecture.
+    """A basic TinyViT layer for one stage in a TinyViT architecture.
 
-    This class represents a single layer in the TinyViT model, consisting of multiple TinyViT blocks
-    and an optional downsampling operation. It processes features at a specific resolution and dimensionality
-    within the overall architecture.
+    This class represents a single layer in the TinyViT model, consisting of multiple TinyViT blocks and an optional
+    downsampling operation. It processes features at a specific resolution and dimensionality within the overall
+    architecture.
 
     Attributes:
         dim (int): The dimensionality of the input and output features.
@@ -704,11 +686,10 @@ class BasicLayer(nn.Module):
         activation=nn.GELU,
         out_dim: int | None = None,
     ):
-        """
-        Initialize a BasicLayer in the TinyViT architecture.
+        """Initialize a BasicLayer in the TinyViT architecture.
 
-        This layer consists of multiple TinyViT blocks and an optional downsampling operation. It is designed to
-        process feature maps at a specific resolution and dimensionality within the TinyViT model.
+        This layer consists of multiple TinyViT blocks and an optional downsampling operation. It is designed to process
+        feature maps at a specific resolution and dimensionality within the TinyViT model.
 
         Args:
             dim (int): Dimensionality of the input and output features.
@@ -718,12 +699,14 @@ class BasicLayer(nn.Module):
             window_size (int): Size of the local window for attention computation.
             mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension.
             drop (float, optional): Dropout rate.
-            drop_path (float | list[float], optional): Stochastic depth rate. Can be a float or a list of floats for
-                each block.
+            drop_path (float | list[float], optional): Stochastic depth rate. Can be a float or a list of floats for
+                each block.
+            downsample (nn.Module | None, optional): Downsampling layer at the end of the layer. None to skip
+                downsampling.
             use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
             local_conv_size (int, optional): Kernel size for the local convolution in each TinyViT block.
             activation (nn.Module): Activation function used in the MLP.
-            out_dim (int | None, optional): Output dimension after downsampling. None means it will be the same as
+            out_dim (int | None, optional): Output dimension after downsampling. None means it will be the same as dim.
         """
         super().__init__()
         self.dim = dim
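
Structurally, one such stage can be sketched as a stack of blocks run in sequence (optionally under gradient checkpointing) followed by an optional downsample module. The stand-in block below is a plain linear layer, not a real TinyViTBlock; names are illustrative.

```python
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint


class BasicLayerSketch(nn.Module):
    """Sketch of one stage: a stack of blocks run in sequence, then an optional downsample module."""

    def __init__(self, block_factory, depth: int, downsample=None, use_checkpoint: bool = False):
        super().__init__()
        self.blocks = nn.ModuleList(block_factory() for _ in range(depth))
        self.downsample = downsample
        self.use_checkpoint = use_checkpoint

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for blk in self.blocks:
            # Gradient checkpointing trades recompute for activation memory during training.
            x = checkpoint(blk, x, use_reentrant=False) if self.use_checkpoint else blk(x)
        return x if self.downsample is None else self.downsample(x)


layer = BasicLayerSketch(lambda: nn.Linear(64, 64), depth=2, downsample=nn.Linear(64, 128))
print(layer(torch.randn(1, 10, 64)).shape)  # torch.Size([1, 10, 128])
```
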
@@ -768,12 +751,11 @@ class BasicLayer(nn.Module):
 
 
 class TinyViT(nn.Module):
-    """
-    TinyViT: A compact vision transformer architecture for efficient image classification and feature extraction.
+    """TinyViT: A compact vision transformer architecture for efficient image classification and feature extraction.
 
-    This class implements the TinyViT model, which combines elements of vision transformers and convolutional
-    neural networks for improved efficiency and performance on vision tasks. It features hierarchical processing
-    with patch embedding, multiple stages of attention and convolution blocks, and a feature refinement neck.
+    This class implements the TinyViT model, which combines elements of vision transformers and convolutional neural
+    networks for improved efficiency and performance on vision tasks. It features hierarchical processing with patch
+    embedding, multiple stages of attention and convolution blocks, and a feature refinement neck.
 
     Attributes:
         img_size (int): Input image size.
@@ -813,11 +795,10 @@ class TinyViT(nn.Module):
         local_conv_size: int = 3,
        layer_lr_decay: float = 1.0,
     ):
-        """
-        Initialize the TinyViT model.
+        """Initialize the TinyViT model.
 
-        This constructor sets up the TinyViT architecture, including patch embedding, multiple layers of
-        attention and convolution blocks, and a classification head.
+        This constructor sets up the TinyViT architecture, including patch embedding, multiple layers of attention and
+        convolution blocks, and a classification head.
 
         Args:
             img_size (int, optional): Size of the input image.
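
TinyViT is the image encoder behind the lighter SAM variants shipped in this package. Assuming the MobileSAM checkpoint name used upstream, a minimal usage sketch through the public SAM API might look like this (the weight filename, image path, and prompt coordinates are placeholders):

```python
from ultralytics import SAM

# MobileSAM pairs the TinyViT image encoder described above with the SAM prompt decoder.
model = SAM("mobile_sam.pt")  # checkpoint name assumed; fetched on first use
model.info()

# Prompted segmentation with a single positive point (x, y); values are placeholders.
results = model("path/to/image.jpg", points=[[900, 370]], labels=[1])
```
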
--- a/ultralytics/models/sam/modules/transformer.py
+++ b/ultralytics/models/sam/modules/transformer.py
@@ -11,12 +11,10 @@ from ultralytics.nn.modules import MLPBlock
 
 
 class TwoWayTransformer(nn.Module):
-    """
-    A Two-Way Transformer module for simultaneous attention to image and query points.
+    """A Two-Way Transformer module for simultaneous attention to image and query points.
 
-    This class implements a specialized transformer decoder that attends to an input image using queries with
-    supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
-    cloud processing.
+    This class implements a specialized transformer decoder that attends to an input image using queries with supplied
+    positional embeddings. It's useful for tasks like object detection, image segmentation, and point cloud processing.
 
     Attributes:
         depth (int): Number of layers in the transformer.
@@ -48,8 +46,7 @@ class TwoWayTransformer(nn.Module):
         activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
     ) -> None:
-        """
-        Initialize a Two-Way Transformer for simultaneous attention to image and query points.
+        """Initialize a Two-Way Transformer for simultaneous attention to image and query points.
 
         Args:
             depth (int): Number of layers in the transformer.
@@ -87,8 +84,7 @@ class TwoWayTransformer(nn.Module):
         image_pe: torch.Tensor,
         point_embedding: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Process image and point embeddings through the Two-Way Transformer.
+        """Process image and point embeddings through the Two-Way Transformer.
 
         Args:
             image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
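
In shape terms, the forward pass described above flattens the (B, embedding_dim, H, W) image embedding into H*W dense tokens, lets the sparse point queries and the dense tokens attend to each other across the stacked blocks, and returns both updated tensors. A schematic sketch with dummy blocks, purely to show the data flow:

```python
import torch
from torch import nn


def two_way_forward_sketch(blocks, image_embedding, image_pe, point_embedding):
    """Schematic of the shape flow only; `blocks` stand in for TwoWayAttentionBlock instances."""
    keys = image_embedding.flatten(2).permute(0, 2, 1)  # (B, H*W, C): dense image tokens
    key_pe = image_pe.flatten(2).permute(0, 2, 1)       # matching positional encoding
    queries = point_embedding                            # (B, N_points, C): sparse query tokens
    for blk in blocks:
        queries, keys = blk(queries, keys, point_embedding, key_pe)
    return queries, keys


class _DummyBlock(nn.Module):
    def forward(self, queries, keys, query_pe, key_pe):
        return queries + query_pe, keys + key_pe


q, k = two_way_forward_sketch(
    nn.ModuleList([_DummyBlock(), _DummyBlock()]),
    torch.randn(1, 256, 64, 64), torch.randn(1, 256, 64, 64), torch.randn(1, 5, 256),
)
print(q.shape, k.shape)  # torch.Size([1, 5, 256]) torch.Size([1, 4096, 256])
```
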
@@ -127,12 +123,11 @@ class TwoWayTransformer(nn.Module):
 
 
 class TwoWayAttentionBlock(nn.Module):
-    """
-    A two-way attention block for simultaneous attention to image and query points.
+    """A two-way attention block for simultaneous attention to image and query points.
 
     This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
-    cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
-    inputs to sparse inputs.
+    cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense inputs to
+    sparse inputs.
 
     Attributes:
         self_attn (Attention): Self-attention layer for queries.
@@ -167,12 +162,11 @@ class TwoWayAttentionBlock(nn.Module):
         attention_downsample_rate: int = 2,
         skip_first_layer_pe: bool = False,
     ) -> None:
-        """
-        Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.
+        """Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.
 
         This block implements a specialized transformer layer with four main components: self-attention on sparse
-        inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
-        of dense inputs to sparse inputs.
+        inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of
+        dense inputs to sparse inputs.
 
         Args:
             embedding_dim (int): Channel dimension of the embeddings.
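
A hedged sketch of the four-step block described above, built from stock `nn.MultiheadAttention` layers rather than the module's own attention implementation: self-attention on the sparse queries, cross-attention of queries to the dense image tokens, an MLP on the queries, and cross-attention of the dense tokens back to the queries, each with a residual add and layer norm. Dimensions and names are assumptions.

```python
import torch
from torch import nn


class TwoWayAttentionBlockSketch(nn.Module):
    """Sketch of the four-step block: self-attn on queries, cross q->k, MLP, then cross k->q."""

    def __init__(self, dim: int = 256, num_heads: int = 8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.cross_q2k = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(dim, dim * 4), nn.ReLU(), nn.Linear(dim * 4, dim))
        self.cross_k2q = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.norms = nn.ModuleList(nn.LayerNorm(dim) for _ in range(4))

    def forward(self, queries, keys, query_pe, key_pe):
        # 1) Self-attention on the sparse (query) tokens.
        q = queries + query_pe
        queries = self.norms[0](queries + self.self_attn(q, q, queries, need_weights=False)[0])
        # 2) Cross-attention: sparse queries attend to dense image tokens.
        q, k = queries + query_pe, keys + key_pe
        queries = self.norms[1](queries + self.cross_q2k(q, k, keys, need_weights=False)[0])
        # 3) MLP on the sparse tokens.
        queries = self.norms[2](queries + self.mlp(queries))
        # 4) Cross-attention: dense image tokens attend back to the sparse queries.
        q, k = queries + query_pe, keys + key_pe
        keys = self.norms[3](keys + self.cross_k2q(k, q, queries, need_weights=False)[0])
        return queries, keys


blk = TwoWayAttentionBlockSketch()
q, k = blk(torch.randn(1, 5, 256), torch.randn(1, 4096, 256), torch.randn(1, 5, 256), torch.randn(1, 4096, 256))
print(q.shape, k.shape)  # torch.Size([1, 5, 256]) torch.Size([1, 4096, 256])
```
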
@@ -200,8 +194,7 @@ class TwoWayAttentionBlock(nn.Module):
     def forward(
         self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Apply two-way attention to process query and key embeddings in a transformer block.
+        """Apply two-way attention to process query and key embeddings in a transformer block.
 
         Args:
             queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
@@ -245,11 +238,10 @@ class TwoWayAttentionBlock(nn.Module):
 
 
 class Attention(nn.Module):
-    """
-    An attention layer with downscaling capability for embedding size after projection.
+    """An attention layer with downscaling capability for embedding size after projection.
 
-    This class implements a multi-head attention mechanism with the option to downsample the internal
-    dimension of queries, keys, and values.
+    This class implements a multi-head attention mechanism with the option to downsample the internal dimension of
+    queries, keys, and values.
 
     Attributes:
         embedding_dim (int): Dimensionality of input embeddings.
@@ -280,10 +272,9 @@ class Attention(nn.Module):
         embedding_dim: int,
         num_heads: int,
         downsample_rate: int = 1,
-        kv_in_dim: int = None,
+        kv_in_dim: int | None = None,
     ) -> None:
-        """
-        Initialize the Attention module with specified dimensions and settings.
+        """Initialize the Attention module with specified dimensions and settings.
 
         Args:
             embedding_dim (int): Dimensionality of input embeddings.
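
The downscaling idea is simple to sketch: project queries, keys, and values to an internal width of `embedding_dim // downsample_rate`, run multi-head attention there, and project back to `embedding_dim`. This sketch uses `scaled_dot_product_attention` for brevity and is not the actual Ultralytics module; names are illustrative.

```python
import torch
import torch.nn.functional as F
from torch import nn


class DownsampledAttentionSketch(nn.Module):
    """Sketch: multi-head attention whose internal width is embedding_dim // downsample_rate."""

    def __init__(self, embedding_dim: int, num_heads: int, downsample_rate: int = 1):
        super().__init__()
        self.num_heads = num_heads
        inner = embedding_dim // downsample_rate  # internal dim; must stay divisible by num_heads
        self.q_proj = nn.Linear(embedding_dim, inner)
        self.k_proj = nn.Linear(embedding_dim, inner)
        self.v_proj = nn.Linear(embedding_dim, inner)
        self.out_proj = nn.Linear(inner, embedding_dim)

    def _split(self, x: torch.Tensor) -> torch.Tensor:
        b, n, _ = x.shape  # (B, N, inner) -> (B, heads, N, inner // heads)
        return x.view(b, n, self.num_heads, -1).transpose(1, 2)

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        q, k, v = self._split(self.q_proj(q)), self._split(self.k_proj(k)), self._split(self.v_proj(v))
        out = F.scaled_dot_product_attention(q, k, v)  # (B, heads, N_q, head_dim)
        out = out.transpose(1, 2).flatten(2)           # (B, N_q, inner)
        return self.out_proj(out)                      # back to embedding_dim


attn = DownsampledAttentionSketch(256, num_heads=8, downsample_rate=2)
print(attn(torch.randn(1, 5, 256), torch.randn(1, 100, 256), torch.randn(1, 100, 256)).shape)  # (1, 5, 256)
```
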
@@ -321,8 +312,7 @@ class Attention(nn.Module):
         return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
 
     def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        """
-        Apply multi-head attention to query, key, and value tensors with optional downsampling.
+        """Apply multi-head attention to query, key, and value tensors with optional downsampling.
 
         Args:
             q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).