dgenerate-ultralytics-headless 8.3.196__py3-none-any.whl → 8.3.248__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (243)
  1. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/METADATA +33 -34
  2. dgenerate_ultralytics_headless-8.3.248.dist-info/RECORD +298 -0
  3. tests/__init__.py +5 -7
  4. tests/conftest.py +8 -15
  5. tests/test_cli.py +8 -10
  6. tests/test_cuda.py +9 -10
  7. tests/test_engine.py +29 -2
  8. tests/test_exports.py +69 -21
  9. tests/test_integrations.py +8 -11
  10. tests/test_python.py +109 -71
  11. tests/test_solutions.py +170 -159
  12. ultralytics/__init__.py +27 -9
  13. ultralytics/cfg/__init__.py +57 -64
  14. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  15. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  16. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  17. ultralytics/cfg/datasets/ImageNet.yaml +1 -1
  18. ultralytics/cfg/datasets/Objects365.yaml +19 -15
  19. ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
  20. ultralytics/cfg/datasets/VOC.yaml +19 -21
  21. ultralytics/cfg/datasets/VisDrone.yaml +5 -5
  22. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  23. ultralytics/cfg/datasets/coco-pose.yaml +24 -2
  24. ultralytics/cfg/datasets/coco.yaml +2 -2
  25. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  26. ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
  27. ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
  28. ultralytics/cfg/datasets/dog-pose.yaml +28 -0
  29. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  30. ultralytics/cfg/datasets/dota8.yaml +2 -2
  31. ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
  32. ultralytics/cfg/datasets/kitti.yaml +27 -0
  33. ultralytics/cfg/datasets/lvis.yaml +7 -7
  34. ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
  35. ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
  36. ultralytics/cfg/datasets/xView.yaml +16 -16
  37. ultralytics/cfg/default.yaml +96 -94
  38. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  39. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  40. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  41. ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
  42. ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
  43. ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
  44. ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
  45. ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
  46. ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
  47. ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
  48. ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
  49. ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
  50. ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
  51. ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
  52. ultralytics/cfg/models/v6/yolov6.yaml +1 -1
  53. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  54. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  55. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  56. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  57. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  58. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  59. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  60. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  61. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  62. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  63. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  64. ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
  65. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  66. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  67. ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
  68. ultralytics/cfg/trackers/botsort.yaml +16 -17
  69. ultralytics/cfg/trackers/bytetrack.yaml +9 -11
  70. ultralytics/data/__init__.py +4 -4
  71. ultralytics/data/annotator.py +3 -4
  72. ultralytics/data/augment.py +286 -476
  73. ultralytics/data/base.py +18 -26
  74. ultralytics/data/build.py +151 -26
  75. ultralytics/data/converter.py +38 -50
  76. ultralytics/data/dataset.py +47 -75
  77. ultralytics/data/loaders.py +42 -49
  78. ultralytics/data/split.py +5 -6
  79. ultralytics/data/split_dota.py +8 -15
  80. ultralytics/data/utils.py +41 -45
  81. ultralytics/engine/exporter.py +462 -462
  82. ultralytics/engine/model.py +150 -191
  83. ultralytics/engine/predictor.py +30 -40
  84. ultralytics/engine/results.py +177 -311
  85. ultralytics/engine/trainer.py +193 -120
  86. ultralytics/engine/tuner.py +77 -63
  87. ultralytics/engine/validator.py +39 -22
  88. ultralytics/hub/__init__.py +16 -19
  89. ultralytics/hub/auth.py +6 -12
  90. ultralytics/hub/google/__init__.py +7 -10
  91. ultralytics/hub/session.py +15 -25
  92. ultralytics/hub/utils.py +5 -8
  93. ultralytics/models/__init__.py +1 -1
  94. ultralytics/models/fastsam/__init__.py +1 -1
  95. ultralytics/models/fastsam/model.py +8 -10
  96. ultralytics/models/fastsam/predict.py +19 -30
  97. ultralytics/models/fastsam/utils.py +1 -2
  98. ultralytics/models/fastsam/val.py +5 -7
  99. ultralytics/models/nas/__init__.py +1 -1
  100. ultralytics/models/nas/model.py +5 -8
  101. ultralytics/models/nas/predict.py +7 -9
  102. ultralytics/models/nas/val.py +1 -2
  103. ultralytics/models/rtdetr/__init__.py +1 -1
  104. ultralytics/models/rtdetr/model.py +7 -8
  105. ultralytics/models/rtdetr/predict.py +15 -19
  106. ultralytics/models/rtdetr/train.py +10 -13
  107. ultralytics/models/rtdetr/val.py +21 -23
  108. ultralytics/models/sam/__init__.py +15 -2
  109. ultralytics/models/sam/amg.py +14 -20
  110. ultralytics/models/sam/build.py +26 -19
  111. ultralytics/models/sam/build_sam3.py +377 -0
  112. ultralytics/models/sam/model.py +29 -32
  113. ultralytics/models/sam/modules/blocks.py +83 -144
  114. ultralytics/models/sam/modules/decoders.py +22 -40
  115. ultralytics/models/sam/modules/encoders.py +44 -101
  116. ultralytics/models/sam/modules/memory_attention.py +16 -30
  117. ultralytics/models/sam/modules/sam.py +206 -79
  118. ultralytics/models/sam/modules/tiny_encoder.py +64 -83
  119. ultralytics/models/sam/modules/transformer.py +18 -28
  120. ultralytics/models/sam/modules/utils.py +174 -50
  121. ultralytics/models/sam/predict.py +2268 -366
  122. ultralytics/models/sam/sam3/__init__.py +3 -0
  123. ultralytics/models/sam/sam3/decoder.py +546 -0
  124. ultralytics/models/sam/sam3/encoder.py +529 -0
  125. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  126. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  127. ultralytics/models/sam/sam3/model_misc.py +199 -0
  128. ultralytics/models/sam/sam3/necks.py +129 -0
  129. ultralytics/models/sam/sam3/sam3_image.py +339 -0
  130. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  131. ultralytics/models/sam/sam3/vitdet.py +547 -0
  132. ultralytics/models/sam/sam3/vl_combiner.py +160 -0
  133. ultralytics/models/utils/loss.py +14 -26
  134. ultralytics/models/utils/ops.py +13 -17
  135. ultralytics/models/yolo/__init__.py +1 -1
  136. ultralytics/models/yolo/classify/predict.py +9 -12
  137. ultralytics/models/yolo/classify/train.py +15 -41
  138. ultralytics/models/yolo/classify/val.py +34 -32
  139. ultralytics/models/yolo/detect/predict.py +8 -11
  140. ultralytics/models/yolo/detect/train.py +13 -32
  141. ultralytics/models/yolo/detect/val.py +75 -63
  142. ultralytics/models/yolo/model.py +37 -53
  143. ultralytics/models/yolo/obb/predict.py +5 -14
  144. ultralytics/models/yolo/obb/train.py +11 -14
  145. ultralytics/models/yolo/obb/val.py +42 -39
  146. ultralytics/models/yolo/pose/__init__.py +1 -1
  147. ultralytics/models/yolo/pose/predict.py +7 -22
  148. ultralytics/models/yolo/pose/train.py +10 -22
  149. ultralytics/models/yolo/pose/val.py +40 -59
  150. ultralytics/models/yolo/segment/predict.py +16 -20
  151. ultralytics/models/yolo/segment/train.py +3 -12
  152. ultralytics/models/yolo/segment/val.py +106 -56
  153. ultralytics/models/yolo/world/train.py +12 -16
  154. ultralytics/models/yolo/world/train_world.py +11 -34
  155. ultralytics/models/yolo/yoloe/__init__.py +7 -7
  156. ultralytics/models/yolo/yoloe/predict.py +16 -23
  157. ultralytics/models/yolo/yoloe/train.py +31 -56
  158. ultralytics/models/yolo/yoloe/train_seg.py +5 -10
  159. ultralytics/models/yolo/yoloe/val.py +16 -21
  160. ultralytics/nn/__init__.py +7 -7
  161. ultralytics/nn/autobackend.py +152 -80
  162. ultralytics/nn/modules/__init__.py +60 -60
  163. ultralytics/nn/modules/activation.py +4 -6
  164. ultralytics/nn/modules/block.py +133 -217
  165. ultralytics/nn/modules/conv.py +52 -97
  166. ultralytics/nn/modules/head.py +64 -116
  167. ultralytics/nn/modules/transformer.py +79 -89
  168. ultralytics/nn/modules/utils.py +16 -21
  169. ultralytics/nn/tasks.py +111 -156
  170. ultralytics/nn/text_model.py +40 -67
  171. ultralytics/solutions/__init__.py +12 -12
  172. ultralytics/solutions/ai_gym.py +11 -17
  173. ultralytics/solutions/analytics.py +15 -16
  174. ultralytics/solutions/config.py +5 -6
  175. ultralytics/solutions/distance_calculation.py +10 -13
  176. ultralytics/solutions/heatmap.py +7 -13
  177. ultralytics/solutions/instance_segmentation.py +5 -8
  178. ultralytics/solutions/object_blurrer.py +7 -10
  179. ultralytics/solutions/object_counter.py +12 -19
  180. ultralytics/solutions/object_cropper.py +8 -14
  181. ultralytics/solutions/parking_management.py +33 -31
  182. ultralytics/solutions/queue_management.py +10 -12
  183. ultralytics/solutions/region_counter.py +9 -12
  184. ultralytics/solutions/security_alarm.py +15 -20
  185. ultralytics/solutions/similarity_search.py +13 -17
  186. ultralytics/solutions/solutions.py +75 -74
  187. ultralytics/solutions/speed_estimation.py +7 -10
  188. ultralytics/solutions/streamlit_inference.py +4 -7
  189. ultralytics/solutions/templates/similarity-search.html +7 -18
  190. ultralytics/solutions/trackzone.py +7 -10
  191. ultralytics/solutions/vision_eye.py +5 -8
  192. ultralytics/trackers/__init__.py +1 -1
  193. ultralytics/trackers/basetrack.py +3 -5
  194. ultralytics/trackers/bot_sort.py +10 -27
  195. ultralytics/trackers/byte_tracker.py +14 -30
  196. ultralytics/trackers/track.py +3 -6
  197. ultralytics/trackers/utils/gmc.py +11 -22
  198. ultralytics/trackers/utils/kalman_filter.py +37 -48
  199. ultralytics/trackers/utils/matching.py +12 -15
  200. ultralytics/utils/__init__.py +116 -116
  201. ultralytics/utils/autobatch.py +2 -4
  202. ultralytics/utils/autodevice.py +17 -18
  203. ultralytics/utils/benchmarks.py +70 -70
  204. ultralytics/utils/callbacks/base.py +8 -10
  205. ultralytics/utils/callbacks/clearml.py +5 -13
  206. ultralytics/utils/callbacks/comet.py +32 -46
  207. ultralytics/utils/callbacks/dvc.py +13 -18
  208. ultralytics/utils/callbacks/mlflow.py +4 -5
  209. ultralytics/utils/callbacks/neptune.py +7 -15
  210. ultralytics/utils/callbacks/platform.py +314 -38
  211. ultralytics/utils/callbacks/raytune.py +3 -4
  212. ultralytics/utils/callbacks/tensorboard.py +23 -31
  213. ultralytics/utils/callbacks/wb.py +10 -13
  214. ultralytics/utils/checks.py +151 -87
  215. ultralytics/utils/cpu.py +3 -8
  216. ultralytics/utils/dist.py +19 -15
  217. ultralytics/utils/downloads.py +29 -41
  218. ultralytics/utils/errors.py +6 -14
  219. ultralytics/utils/events.py +2 -4
  220. ultralytics/utils/export/__init__.py +7 -0
  221. ultralytics/utils/{export.py → export/engine.py} +16 -16
  222. ultralytics/utils/export/imx.py +325 -0
  223. ultralytics/utils/export/tensorflow.py +231 -0
  224. ultralytics/utils/files.py +24 -28
  225. ultralytics/utils/git.py +9 -11
  226. ultralytics/utils/instance.py +30 -51
  227. ultralytics/utils/logger.py +212 -114
  228. ultralytics/utils/loss.py +15 -24
  229. ultralytics/utils/metrics.py +131 -160
  230. ultralytics/utils/nms.py +21 -30
  231. ultralytics/utils/ops.py +107 -165
  232. ultralytics/utils/patches.py +33 -21
  233. ultralytics/utils/plotting.py +122 -119
  234. ultralytics/utils/tal.py +28 -44
  235. ultralytics/utils/torch_utils.py +70 -187
  236. ultralytics/utils/tqdm.py +20 -20
  237. ultralytics/utils/triton.py +13 -19
  238. ultralytics/utils/tuner.py +17 -5
  239. dgenerate_ultralytics_headless-8.3.196.dist-info/RECORD +0 -281
  240. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/WHEEL +0 -0
  241. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/entry_points.txt +0 -0
  242. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/licenses/LICENSE +0 -0
  243. {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/tiny_encoder.py
@@ -22,12 +22,11 @@ from ultralytics.utils.instance import to_2tuple


  class Conv2d_BN(torch.nn.Sequential):
- """
- A sequential container that performs 2D convolution followed by batch normalization.
+ """A sequential container that performs 2D convolution followed by batch normalization.

- This module combines a 2D convolution layer with batch normalization, providing a common building block
- for convolutional neural networks. The batch normalization weights and biases are initialized to specific
- values for optimal training performance.
+ This module combines a 2D convolution layer with batch normalization, providing a common building block for
+ convolutional neural networks. The batch normalization weights and biases are initialized to specific values for
+ optimal training performance.

  Attributes:
  c (torch.nn.Conv2d): 2D convolution layer.
@@ -52,8 +51,7 @@ class Conv2d_BN(torch.nn.Sequential):
  groups: int = 1,
  bn_weight_init: float = 1,
  ):
- """
- Initialize a sequential container with 2D convolution followed by batch normalization.
+ """Initialize a sequential container with 2D convolution followed by batch normalization.

  Args:
  a (int): Number of input channels.
@@ -74,11 +72,10 @@ class Conv2d_BN(torch.nn.Sequential):


  class PatchEmbed(nn.Module):
- """
- Embed images into patches and project them into a specified embedding dimension.
+ """Embed images into patches and project them into a specified embedding dimension.

- This module converts input images into patch embeddings using a sequence of convolutional layers,
- effectively downsampling the spatial dimensions while increasing the channel dimension.
+ This module converts input images into patch embeddings using a sequence of convolutional layers, effectively
+ downsampling the spatial dimensions while increasing the channel dimension.

  Attributes:
  patches_resolution (tuple[int, int]): Resolution of the patches after embedding.
@@ -97,8 +94,7 @@ class PatchEmbed(nn.Module):
  """

  def __init__(self, in_chans: int, embed_dim: int, resolution: int, activation):
- """
- Initialize patch embedding with convolutional layers for image-to-patch conversion and projection.
+ """Initialize patch embedding with convolutional layers for image-to-patch conversion and projection.

  Args:
  in_chans (int): Number of input channels.
@@ -125,11 +121,10 @@ class PatchEmbed(nn.Module):


  class MBConv(nn.Module):
- """
- Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.
+ """Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture.

- This module implements the mobile inverted bottleneck convolution with expansion, depthwise convolution,
- and projection phases, along with residual connections for improved gradient flow.
+ This module implements the mobile inverted bottleneck convolution with expansion, depthwise convolution, and
+ projection phases, along with residual connections for improved gradient flow.

  Attributes:
  in_chans (int): Number of input channels.
@@ -153,8 +148,7 @@ class MBConv(nn.Module):
  """

  def __init__(self, in_chans: int, out_chans: int, expand_ratio: float, activation, drop_path: float):
- """
- Initialize the MBConv layer with specified input/output channels, expansion ratio, and activation.
+ """Initialize the MBConv layer with specified input/output channels, expansion ratio, and activation.

  Args:
  in_chans (int): Number of input channels.
@@ -195,12 +189,11 @@ class MBConv(nn.Module):


  class PatchMerging(nn.Module):
- """
- Merge neighboring patches in the feature map and project to a new dimension.
+ """Merge neighboring patches in the feature map and project to a new dimension.

- This class implements a patch merging operation that combines spatial information and adjusts the feature
- dimension using a series of convolutional layers with batch normalization. It effectively reduces spatial
- resolution while potentially increasing channel dimensions.
+ This class implements a patch merging operation that combines spatial information and adjusts the feature dimension
+ using a series of convolutional layers with batch normalization. It effectively reduces spatial resolution while
+ potentially increasing channel dimensions.

  Attributes:
  input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
@@ -221,8 +214,7 @@ class PatchMerging(nn.Module):
  """

  def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
- """
- Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.
+ """Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.

  Args:
  input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
@@ -259,11 +251,10 @@ class PatchMerging(nn.Module):


  class ConvLayer(nn.Module):
- """
- Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
+ """Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).

- This layer optionally applies downsample operations to the output and supports gradient checkpointing
- for memory efficiency during training.
+ This layer optionally applies downsample operations to the output and supports gradient checkpointing for memory
+ efficiency during training.

  Attributes:
  dim (int): Dimensionality of the input and output.
@@ -293,11 +284,10 @@ class ConvLayer(nn.Module):
  out_dim: int | None = None,
  conv_expand_ratio: float = 4.0,
  ):
- """
- Initialize the ConvLayer with the given dimensions and settings.
+ """Initialize the ConvLayer with the given dimensions and settings.

- This layer consists of multiple MobileNetV3-style inverted bottleneck convolutions (MBConv) and
- optionally applies downsampling to the output.
+ This layer consists of multiple MobileNetV3-style inverted bottleneck convolutions (MBConv) and optionally
+ applies downsampling to the output.

  Args:
  dim (int): The dimensionality of the input and output.
@@ -307,7 +297,7 @@ class ConvLayer(nn.Module):
  drop_path (float | list[float], optional): Drop path rate. Single float or a list of floats for each MBConv.
  downsample (Optional[nn.Module], optional): Function for downsampling the output. None to skip downsampling.
  use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
- out_dim (Optional[int], optional): The dimensionality of the output. None means it will be the same as `dim`.
+ out_dim (Optional[int], optional): Output dimensions. None means it will be the same as `dim`.
  conv_expand_ratio (float, optional): Expansion ratio for the MBConv layers.
  """
  super().__init__()
@@ -345,11 +335,10 @@ class ConvLayer(nn.Module):


  class MLP(nn.Module):
- """
- Multi-layer Perceptron (MLP) module for transformer architectures.
+ """Multi-layer Perceptron (MLP) module for transformer architectures.

- This module applies layer normalization, two fully-connected layers with an activation function in between,
- and dropout. It is commonly used in transformer-based architectures for processing token embeddings.
+ This module applies layer normalization, two fully-connected layers with an activation function in between, and
+ dropout. It is commonly used in transformer-based architectures for processing token embeddings.

  Attributes:
  norm (nn.LayerNorm): Layer normalization applied to the input.
@@ -376,8 +365,7 @@ class MLP(nn.Module):
  activation=nn.GELU,
  drop: float = 0.0,
  ):
- """
- Initialize a multi-layer perceptron with configurable input, hidden, and output dimensions.
+ """Initialize a multi-layer perceptron with configurable input, hidden, and output dimensions.

  Args:
  in_features (int): Number of input features.
@@ -406,12 +394,11 @@ class MLP(nn.Module):


  class Attention(torch.nn.Module):
- """
- Multi-head attention module with spatial awareness and trainable attention biases.
+ """Multi-head attention module with spatial awareness and trainable attention biases.

- This module implements a multi-head attention mechanism with support for spatial awareness, applying
- attention biases based on spatial resolution. It includes trainable attention biases for each unique
- offset between spatial positions in the resolution grid.
+ This module implements a multi-head attention mechanism with support for spatial awareness, applying attention
+ biases based on spatial resolution. It includes trainable attention biases for each unique offset between spatial
+ positions in the resolution grid.

  Attributes:
  num_heads (int): Number of attention heads.
@@ -444,12 +431,11 @@ class Attention(torch.nn.Module):
  attn_ratio: float = 4,
  resolution: tuple[int, int] = (14, 14),
  ):
- """
- Initialize the Attention module for multi-head attention with spatial awareness.
+ """Initialize the Attention module for multi-head attention with spatial awareness.

- This module implements a multi-head attention mechanism with support for spatial awareness, applying
- attention biases based on spatial resolution. It includes trainable attention biases for each unique
- offset between spatial positions in the resolution grid.
+ This module implements a multi-head attention mechanism with support for spatial awareness, applying attention
+ biases based on spatial resolution. It includes trainable attention biases for each unique offset between
+ spatial positions in the resolution grid.

  Args:
  dim (int): The dimensionality of the input and output.
@@ -521,12 +507,11 @@ class Attention(torch.nn.Module):


  class TinyViTBlock(nn.Module):
- """
- TinyViT Block that applies self-attention and a local convolution to the input.
+ """TinyViT Block that applies self-attention and a local convolution to the input.

- This block is a key component of the TinyViT architecture, combining self-attention mechanisms with
- local convolutions to process input features efficiently. It supports windowed attention for
- computational efficiency and includes residual connections.
+ This block is a key component of the TinyViT architecture, combining self-attention mechanisms with local
+ convolutions to process input features efficiently. It supports windowed attention for computational efficiency and
+ includes residual connections.

  Attributes:
  dim (int): The dimensionality of the input and output.
@@ -559,11 +544,10 @@ class TinyViTBlock(nn.Module):
  local_conv_size: int = 3,
  activation=nn.GELU,
  ):
- """
- Initialize a TinyViT block with self-attention and local convolution.
+ """Initialize a TinyViT block with self-attention and local convolution.

- This block is a key component of the TinyViT architecture, combining self-attention mechanisms with
- local convolutions to process input features efficiently.
+ This block is a key component of the TinyViT architecture, combining self-attention mechanisms with local
+ convolutions to process input features efficiently.

  Args:
  dim (int): Dimensionality of the input and output features.
@@ -644,8 +628,7 @@ class TinyViTBlock(nn.Module):
  return x + self.drop_path(self.mlp(x))

  def extra_repr(self) -> str:
- """
- Return a string representation of the TinyViTBlock's parameters.
+ """Return a string representation of the TinyViTBlock's parameters.

  This method provides a formatted string containing key information about the TinyViTBlock, including its
  dimension, input resolution, number of attention heads, window size, and MLP ratio.
@@ -665,12 +648,11 @@ class TinyViTBlock(nn.Module):


  class BasicLayer(nn.Module):
- """
- A basic TinyViT layer for one stage in a TinyViT architecture.
+ """A basic TinyViT layer for one stage in a TinyViT architecture.

- This class represents a single layer in the TinyViT model, consisting of multiple TinyViT blocks
- and an optional downsampling operation. It processes features at a specific resolution and
- dimensionality within the overall architecture.
+ This class represents a single layer in the TinyViT model, consisting of multiple TinyViT blocks and an optional
+ downsampling operation. It processes features at a specific resolution and dimensionality within the overall
+ architecture.

  Attributes:
  dim (int): The dimensionality of the input and output features.
@@ -704,11 +686,10 @@ class BasicLayer(nn.Module):
  activation=nn.GELU,
  out_dim: int | None = None,
  ):
- """
- Initialize a BasicLayer in the TinyViT architecture.
+ """Initialize a BasicLayer in the TinyViT architecture.

- This layer consists of multiple TinyViT blocks and an optional downsampling operation. It is designed to
- process feature maps at a specific resolution and dimensionality within the TinyViT model.
+ This layer consists of multiple TinyViT blocks and an optional downsampling operation. It is designed to process
+ feature maps at a specific resolution and dimensionality within the TinyViT model.

  Args:
  dim (int): Dimensionality of the input and output features.
@@ -718,12 +699,14 @@ class BasicLayer(nn.Module):
  window_size (int): Size of the local window for attention computation.
  mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension.
  drop (float, optional): Dropout rate.
- drop_path (float | list[float], optional): Stochastic depth rate. Can be a float or a list of floats for each block.
- downsample (nn.Module | None, optional): Downsampling layer at the end of the layer. None to skip downsampling.
+ drop_path (float | list[float], optional): Stochastic depth rate. Can be a float or a list of floats for
+ each block.
+ downsample (nn.Module | None, optional): Downsampling layer at the end of the layer. None to skip
+ downsampling.
  use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
  local_conv_size (int, optional): Kernel size for the local convolution in each TinyViT block.
  activation (nn.Module): Activation function used in the MLP.
- out_dim (int | None, optional): Output dimension after downsampling. None means it will be the same as `dim`.
+ out_dim (int | None, optional): Output dimension after downsampling. None means it will be the same as dim.
  """
  super().__init__()
  self.dim = dim
@@ -768,12 +751,11 @@ class BasicLayer(nn.Module):


  class TinyViT(nn.Module):
- """
- TinyViT: A compact vision transformer architecture for efficient image classification and feature extraction.
+ """TinyViT: A compact vision transformer architecture for efficient image classification and feature extraction.

- This class implements the TinyViT model, which combines elements of vision transformers and convolutional
- neural networks for improved efficiency and performance on vision tasks. It features hierarchical processing
- with patch embedding, multiple stages of attention and convolution blocks, and a feature refinement neck.
+ This class implements the TinyViT model, which combines elements of vision transformers and convolutional neural
+ networks for improved efficiency and performance on vision tasks. It features hierarchical processing with patch
+ embedding, multiple stages of attention and convolution blocks, and a feature refinement neck.

  Attributes:
  img_size (int): Input image size.
@@ -813,11 +795,10 @@ class TinyViT(nn.Module):
  local_conv_size: int = 3,
  layer_lr_decay: float = 1.0,
  ):
- """
- Initialize the TinyViT model.
+ """Initialize the TinyViT model.

- This constructor sets up the TinyViT architecture, including patch embedding, multiple layers of
- attention and convolution blocks, and a classification head.
+ This constructor sets up the TinyViT architecture, including patch embedding, multiple layers of attention and
+ convolution blocks, and a classification head.

  Args:
  img_size (int, optional): Size of the input image.
ultralytics/models/sam/modules/transformer.py
@@ -11,12 +11,10 @@ from ultralytics.nn.modules import MLPBlock


  class TwoWayTransformer(nn.Module):
- """
- A Two-Way Transformer module for simultaneous attention to image and query points.
+ """A Two-Way Transformer module for simultaneous attention to image and query points.

- This class implements a specialized transformer decoder that attends to an input image using queries with
- supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
- cloud processing.
+ This class implements a specialized transformer decoder that attends to an input image using queries with supplied
+ positional embeddings. It's useful for tasks like object detection, image segmentation, and point cloud processing.

  Attributes:
  depth (int): Number of layers in the transformer.
@@ -48,8 +46,7 @@ class TwoWayTransformer(nn.Module):
  activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  ) -> None:
- """
- Initialize a Two-Way Transformer for simultaneous attention to image and query points.
+ """Initialize a Two-Way Transformer for simultaneous attention to image and query points.

  Args:
  depth (int): Number of layers in the transformer.
@@ -87,8 +84,7 @@ class TwoWayTransformer(nn.Module):
  image_pe: torch.Tensor,
  point_embedding: torch.Tensor,
  ) -> tuple[torch.Tensor, torch.Tensor]:
- """
- Process image and point embeddings through the Two-Way Transformer.
+ """Process image and point embeddings through the Two-Way Transformer.

  Args:
  image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
@@ -127,12 +123,11 @@ class TwoWayTransformer(nn.Module):


  class TwoWayAttentionBlock(nn.Module):
- """
- A two-way attention block for simultaneous attention to image and query points.
+ """A two-way attention block for simultaneous attention to image and query points.

  This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
- cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
- inputs to sparse inputs.
+ cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense inputs to
+ sparse inputs.

  Attributes:
  self_attn (Attention): Self-attention layer for queries.
@@ -167,12 +162,11 @@ class TwoWayAttentionBlock(nn.Module):
  attention_downsample_rate: int = 2,
  skip_first_layer_pe: bool = False,
  ) -> None:
- """
- Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.
+ """Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

  This block implements a specialized transformer layer with four main components: self-attention on sparse
- inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
- of dense inputs to sparse inputs.
+ inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of
+ dense inputs to sparse inputs.

  Args:
  embedding_dim (int): Channel dimension of the embeddings.
@@ -200,8 +194,7 @@ class TwoWayAttentionBlock(nn.Module):
  def forward(
  self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
  ) -> tuple[torch.Tensor, torch.Tensor]:
- """
- Apply two-way attention to process query and key embeddings in a transformer block.
+ """Apply two-way attention to process query and key embeddings in a transformer block.

  Args:
  queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
@@ -245,11 +238,10 @@ class TwoWayAttentionBlock(nn.Module):


  class Attention(nn.Module):
- """
- An attention layer with downscaling capability for embedding size after projection.
+ """An attention layer with downscaling capability for embedding size after projection.

- This class implements a multi-head attention mechanism with the option to downsample the internal
- dimension of queries, keys, and values.
+ This class implements a multi-head attention mechanism with the option to downsample the internal dimension of
+ queries, keys, and values.

  Attributes:
  embedding_dim (int): Dimensionality of input embeddings.
@@ -280,10 +272,9 @@ class Attention(nn.Module):
  embedding_dim: int,
  num_heads: int,
  downsample_rate: int = 1,
- kv_in_dim: int = None,
+ kv_in_dim: int | None = None,
  ) -> None:
- """
- Initialize the Attention module with specified dimensions and settings.
+ """Initialize the Attention module with specified dimensions and settings.

  Args:
  embedding_dim (int): Dimensionality of input embeddings.
@@ -321,8 +312,7 @@ class Attention(nn.Module):
  return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C

  def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
- """
- Apply multi-head attention to query, key, and value tensors with optional downsampling.
+ """Apply multi-head attention to query, key, and value tensors with optional downsampling.

  Args:
  q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).