dgenerate-ultralytics-headless 8.3.222__py3-none-any.whl → 8.3.225__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.222.dist-info → dgenerate_ultralytics_headless-8.3.225.dist-info}/METADATA +2 -2
- dgenerate_ultralytics_headless-8.3.225.dist-info/RECORD +286 -0
- tests/conftest.py +5 -8
- tests/test_cli.py +1 -8
- tests/test_python.py +1 -2
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +34 -49
- ultralytics/cfg/datasets/ImageNet.yaml +1 -1
- ultralytics/cfg/datasets/kitti.yaml +27 -0
- ultralytics/cfg/datasets/lvis.yaml +5 -5
- ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
- ultralytics/data/annotator.py +3 -4
- ultralytics/data/augment.py +244 -323
- ultralytics/data/base.py +12 -22
- ultralytics/data/build.py +47 -40
- ultralytics/data/converter.py +32 -42
- ultralytics/data/dataset.py +43 -71
- ultralytics/data/loaders.py +22 -34
- ultralytics/data/split.py +5 -6
- ultralytics/data/split_dota.py +8 -15
- ultralytics/data/utils.py +27 -36
- ultralytics/engine/exporter.py +49 -116
- ultralytics/engine/model.py +144 -180
- ultralytics/engine/predictor.py +18 -29
- ultralytics/engine/results.py +165 -231
- ultralytics/engine/trainer.py +11 -19
- ultralytics/engine/tuner.py +13 -23
- ultralytics/engine/validator.py +6 -10
- ultralytics/hub/__init__.py +7 -12
- ultralytics/hub/auth.py +6 -12
- ultralytics/hub/google/__init__.py +7 -10
- ultralytics/hub/session.py +15 -25
- ultralytics/hub/utils.py +3 -6
- ultralytics/models/fastsam/model.py +6 -8
- ultralytics/models/fastsam/predict.py +5 -10
- ultralytics/models/fastsam/utils.py +1 -2
- ultralytics/models/fastsam/val.py +2 -4
- ultralytics/models/nas/model.py +5 -8
- ultralytics/models/nas/predict.py +7 -9
- ultralytics/models/nas/val.py +1 -2
- ultralytics/models/rtdetr/model.py +5 -8
- ultralytics/models/rtdetr/predict.py +15 -18
- ultralytics/models/rtdetr/train.py +10 -13
- ultralytics/models/rtdetr/val.py +13 -20
- ultralytics/models/sam/amg.py +12 -18
- ultralytics/models/sam/build.py +6 -9
- ultralytics/models/sam/model.py +16 -23
- ultralytics/models/sam/modules/blocks.py +62 -84
- ultralytics/models/sam/modules/decoders.py +17 -24
- ultralytics/models/sam/modules/encoders.py +40 -56
- ultralytics/models/sam/modules/memory_attention.py +10 -16
- ultralytics/models/sam/modules/sam.py +41 -47
- ultralytics/models/sam/modules/tiny_encoder.py +64 -83
- ultralytics/models/sam/modules/transformer.py +17 -27
- ultralytics/models/sam/modules/utils.py +31 -42
- ultralytics/models/sam/predict.py +172 -209
- ultralytics/models/utils/loss.py +14 -26
- ultralytics/models/utils/ops.py +13 -17
- ultralytics/models/yolo/classify/predict.py +8 -11
- ultralytics/models/yolo/classify/train.py +8 -16
- ultralytics/models/yolo/classify/val.py +13 -20
- ultralytics/models/yolo/detect/predict.py +4 -8
- ultralytics/models/yolo/detect/train.py +11 -20
- ultralytics/models/yolo/detect/val.py +38 -48
- ultralytics/models/yolo/model.py +35 -47
- ultralytics/models/yolo/obb/predict.py +5 -8
- ultralytics/models/yolo/obb/train.py +11 -14
- ultralytics/models/yolo/obb/val.py +20 -28
- ultralytics/models/yolo/pose/predict.py +5 -8
- ultralytics/models/yolo/pose/train.py +4 -8
- ultralytics/models/yolo/pose/val.py +31 -39
- ultralytics/models/yolo/segment/predict.py +9 -14
- ultralytics/models/yolo/segment/train.py +3 -6
- ultralytics/models/yolo/segment/val.py +16 -26
- ultralytics/models/yolo/world/train.py +8 -14
- ultralytics/models/yolo/world/train_world.py +11 -16
- ultralytics/models/yolo/yoloe/predict.py +16 -23
- ultralytics/models/yolo/yoloe/train.py +30 -43
- ultralytics/models/yolo/yoloe/train_seg.py +5 -10
- ultralytics/models/yolo/yoloe/val.py +15 -20
- ultralytics/nn/autobackend.py +10 -18
- ultralytics/nn/modules/activation.py +4 -6
- ultralytics/nn/modules/block.py +99 -185
- ultralytics/nn/modules/conv.py +45 -90
- ultralytics/nn/modules/head.py +44 -98
- ultralytics/nn/modules/transformer.py +44 -76
- ultralytics/nn/modules/utils.py +14 -19
- ultralytics/nn/tasks.py +86 -146
- ultralytics/nn/text_model.py +25 -40
- ultralytics/solutions/ai_gym.py +10 -16
- ultralytics/solutions/analytics.py +7 -10
- ultralytics/solutions/config.py +4 -5
- ultralytics/solutions/distance_calculation.py +9 -12
- ultralytics/solutions/heatmap.py +7 -13
- ultralytics/solutions/instance_segmentation.py +5 -8
- ultralytics/solutions/object_blurrer.py +7 -10
- ultralytics/solutions/object_counter.py +8 -12
- ultralytics/solutions/object_cropper.py +5 -8
- ultralytics/solutions/parking_management.py +12 -14
- ultralytics/solutions/queue_management.py +4 -6
- ultralytics/solutions/region_counter.py +7 -10
- ultralytics/solutions/security_alarm.py +14 -19
- ultralytics/solutions/similarity_search.py +7 -12
- ultralytics/solutions/solutions.py +31 -53
- ultralytics/solutions/speed_estimation.py +6 -9
- ultralytics/solutions/streamlit_inference.py +2 -4
- ultralytics/solutions/trackzone.py +7 -10
- ultralytics/solutions/vision_eye.py +5 -8
- ultralytics/trackers/basetrack.py +2 -4
- ultralytics/trackers/bot_sort.py +6 -11
- ultralytics/trackers/byte_tracker.py +10 -15
- ultralytics/trackers/track.py +3 -6
- ultralytics/trackers/utils/gmc.py +6 -12
- ultralytics/trackers/utils/kalman_filter.py +35 -43
- ultralytics/trackers/utils/matching.py +6 -10
- ultralytics/utils/__init__.py +61 -100
- ultralytics/utils/autobatch.py +2 -4
- ultralytics/utils/autodevice.py +11 -13
- ultralytics/utils/benchmarks.py +25 -35
- ultralytics/utils/callbacks/base.py +8 -10
- ultralytics/utils/callbacks/clearml.py +2 -4
- ultralytics/utils/callbacks/comet.py +30 -44
- ultralytics/utils/callbacks/dvc.py +13 -18
- ultralytics/utils/callbacks/mlflow.py +4 -5
- ultralytics/utils/callbacks/neptune.py +4 -6
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +4 -6
- ultralytics/utils/callbacks/wb.py +10 -13
- ultralytics/utils/checks.py +29 -56
- ultralytics/utils/cpu.py +1 -2
- ultralytics/utils/dist.py +8 -12
- ultralytics/utils/downloads.py +17 -27
- ultralytics/utils/errors.py +6 -8
- ultralytics/utils/events.py +2 -4
- ultralytics/utils/export/__init__.py +4 -239
- ultralytics/utils/export/engine.py +237 -0
- ultralytics/utils/export/imx.py +11 -17
- ultralytics/utils/export/tensorflow.py +217 -0
- ultralytics/utils/files.py +10 -15
- ultralytics/utils/git.py +5 -7
- ultralytics/utils/instance.py +30 -51
- ultralytics/utils/logger.py +11 -15
- ultralytics/utils/loss.py +8 -14
- ultralytics/utils/metrics.py +98 -138
- ultralytics/utils/nms.py +13 -16
- ultralytics/utils/ops.py +47 -74
- ultralytics/utils/patches.py +11 -18
- ultralytics/utils/plotting.py +29 -42
- ultralytics/utils/tal.py +25 -39
- ultralytics/utils/torch_utils.py +45 -73
- ultralytics/utils/tqdm.py +6 -8
- ultralytics/utils/triton.py +9 -12
- ultralytics/utils/tuner.py +1 -2
- dgenerate_ultralytics_headless-8.3.222.dist-info/RECORD +0 -283
- {dgenerate_ultralytics_headless-8.3.222.dist-info → dgenerate_ultralytics_headless-8.3.225.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.222.dist-info → dgenerate_ultralytics_headless-8.3.225.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.222.dist-info → dgenerate_ultralytics_headless-8.3.225.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.222.dist-info → dgenerate_ultralytics_headless-8.3.225.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/blocks.py

@@ -17,8 +17,7 @@ from .utils import add_decomposed_rel_pos, apply_rotary_enc, compute_axial_cis,
 
 
 class DropPath(nn.Module):
-    """
-    Implements stochastic depth regularization for neural networks during training.
+    """Implements stochastic depth regularization for neural networks during training.
 
     Attributes:
         drop_prob (float): Probability of dropping a path during training.
@@ -52,16 +51,14 @@ class DropPath(nn.Module):
 
 
 class MaskDownSampler(nn.Module):
-    """
-    A mask downsampling and embedding module for efficient processing of input masks.
+    """A mask downsampling and embedding module for efficient processing of input masks.
 
-    This class implements a mask downsampler that progressively reduces the spatial dimensions of input masks
-    while expanding their channel dimensions using convolutional layers, layer normalization, and activation
-    functions.
+    This class implements a mask downsampler that progressively reduces the spatial dimensions of input masks while
+    expanding their channel dimensions using convolutional layers, layer normalization, and activation functions.
 
     Attributes:
-        encoder (nn.Sequential): A sequential container of convolutional layers, layer normalization, and
-            activation functions for downsampling and embedding masks.
+        encoder (nn.Sequential): A sequential container of convolutional layers, layer normalization, and activation
+            functions for downsampling and embedding masks.
 
     Methods:
         forward: Downsamples and encodes input mask to embed_dim channels.
@@ -112,11 +109,10 @@ class MaskDownSampler(nn.Module):
 
 
 class CXBlock(nn.Module):
-    """
-    ConvNeXt Block for efficient feature extraction in convolutional neural networks.
+    """ConvNeXt Block for efficient feature extraction in convolutional neural networks.
 
-    This block implements a modified version of the ConvNeXt architecture, offering improved performance and
-    flexibility in feature extraction.
+    This block implements a modified version of the ConvNeXt architecture, offering improved performance and flexibility
+    in feature extraction.
 
     Attributes:
         dwconv (nn.Conv2d): Depthwise or standard 2D convolution layer.
@@ -148,8 +144,7 @@ class CXBlock(nn.Module):
         layer_scale_init_value: float = 1e-6,
         use_dwconv: bool = True,
     ):
-        """
-        Initialize a ConvNeXt Block for efficient feature extraction in convolutional neural networks.
+        """Initialize a ConvNeXt Block for efficient feature extraction in convolutional neural networks.
 
         This block implements a modified version of the ConvNeXt architecture, offering improved performance and
         flexibility in feature extraction.
@@ -206,8 +201,7 @@ class CXBlock(nn.Module):
 
 
 class Fuser(nn.Module):
-    """
-    A module for fusing features through multiple layers of a neural network.
+    """A module for fusing features through multiple layers of a neural network.
 
     This class applies a series of identical layers to an input tensor, optionally projecting the input first.
 
@@ -228,8 +222,7 @@ class Fuser(nn.Module):
     """
 
     def __init__(self, layer: nn.Module, num_layers: int, dim: int | None = None, input_projection: bool = False):
-        """
-        Initialize the Fuser module for feature fusion through multiple layers.
+        """Initialize the Fuser module for feature fusion through multiple layers.
 
         This module creates a sequence of identical layers and optionally applies an input projection.
 
@@ -262,12 +255,11 @@ class Fuser(nn.Module):
 
 
 class SAM2TwoWayAttentionBlock(TwoWayAttentionBlock):
-    """
-    A two-way attention block for performing self-attention and cross-attention in both directions.
+    """A two-way attention block for performing self-attention and cross-attention in both directions.
 
-    This block extends the TwoWayAttentionBlock and consists of four main components: self-attention on
-    sparse inputs, cross-attention from sparse to dense inputs, an MLP block on sparse inputs, and cross-attention
-    from dense to sparse inputs.
+    This block extends the TwoWayAttentionBlock and consists of four main components: self-attention on sparse inputs,
+    cross-attention from sparse to dense inputs, an MLP block on sparse inputs, and cross-attention from dense to sparse
+    inputs.
 
     Attributes:
         self_attn (Attention): Self-attention layer for queries.
@@ -299,12 +291,11 @@ class SAM2TwoWayAttentionBlock(TwoWayAttentionBlock):
         attention_downsample_rate: int = 2,
         skip_first_layer_pe: bool = False,
     ) -> None:
-        """
-        Initialize a SAM2TwoWayAttentionBlock for performing self-attention and cross-attention in two directions.
+        """Initialize a SAM2TwoWayAttentionBlock for performing self-attention and cross-attention in two directions.
 
         This block extends the TwoWayAttentionBlock and consists of four main components: self-attention on sparse
-        inputs, cross-attention from sparse to dense inputs, an MLP block on sparse inputs, and cross-attention
-        from dense to sparse inputs.
+        inputs, cross-attention from sparse to dense inputs, an MLP block on sparse inputs, and cross-attention from
+        dense to sparse inputs.
 
         Args:
             embedding_dim (int): The channel dimension of the embeddings.
@@ -325,12 +316,11 @@ class SAM2TwoWayAttentionBlock(TwoWayAttentionBlock):
 
 
 class SAM2TwoWayTransformer(TwoWayTransformer):
-    """
-    A Two-Way Transformer module for simultaneous attention to image and query points.
+    """A Two-Way Transformer module for simultaneous attention to image and query points.
 
-    This class extends the TwoWayTransformer, implementing a specialized transformer decoder that attends to an
-    input image using queries with supplied positional embeddings. It is particularly useful for tasks like object
-    detection, image segmentation, and point cloud processing.
+    This class extends the TwoWayTransformer, implementing a specialized transformer decoder that attends to an input
+    image using queries with supplied positional embeddings. It is particularly useful for tasks like object detection,
+    image segmentation, and point cloud processing.
 
     Attributes:
         depth (int): Number of layers in the transformer.
@@ -362,11 +352,10 @@ class SAM2TwoWayTransformer(TwoWayTransformer):
         activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
     ) -> None:
-        """
-        Initialize a SAM2TwoWayTransformer instance.
+        """Initialize a SAM2TwoWayTransformer instance.
 
-        This transformer decoder attends to an input image using queries with supplied positional embeddings.
-        It is designed for tasks like object detection, image segmentation, and point cloud processing.
+        This transformer decoder attends to an input image using queries with supplied positional embeddings. It is
+        designed for tasks like object detection, image segmentation, and point cloud processing.
 
         Args:
             depth (int): Number of layers in the transformer.
@@ -403,11 +392,10 @@ class SAM2TwoWayTransformer(TwoWayTransformer):
 
 
 class RoPEAttention(Attention):
-    """
-    Implements rotary position encoding for attention mechanisms in transformer architectures.
+    """Implements rotary position encoding for attention mechanisms in transformer architectures.
 
-    This class extends the base Attention class by incorporating Rotary Position Encoding (RoPE) to enhance
-    the positional awareness of the attention mechanism.
+    This class extends the base Attention class by incorporating Rotary Position Encoding (RoPE) to enhance the
+    positional awareness of the attention mechanism.
 
     Attributes:
         compute_cis (Callable): Function to compute axial complex numbers for rotary encoding.
@@ -501,12 +489,11 @@ def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.T
 
 
 class MultiScaleAttention(nn.Module):
-    """
-    Implements multiscale self-attention with optional query pooling for efficient feature extraction.
+    """Implements multiscale self-attention with optional query pooling for efficient feature extraction.
 
-    This class provides a flexible implementation of multiscale attention, allowing for optional
-    downsampling of query features through pooling. It's designed to enhance the model's ability to capture
-    multiscale information in visual tasks.
+    This class provides a flexible implementation of multiscale attention, allowing for optional downsampling of query
+    features through pooling. It's designed to enhance the model's ability to capture multiscale information in visual
+    tasks.
 
     Attributes:
         dim (int): Input dimension of the feature map.
@@ -581,11 +568,10 @@ class MultiScaleAttention(nn.Module):
 
 
 class MultiScaleBlock(nn.Module):
-    """
-    A multiscale attention block with window partitioning and query pooling for efficient vision transformers.
+    """A multiscale attention block with window partitioning and query pooling for efficient vision transformers.
 
-    This class implements a multiscale attention mechanism with optional window partitioning and downsampling,
-    designed for use in vision transformer architectures.
+    This class implements a multiscale attention mechanism with optional window partitioning and downsampling, designed
+    for use in vision transformer architectures.
 
     Attributes:
         dim (int): Input dimension of the block.
@@ -696,11 +682,10 @@ class MultiScaleBlock(nn.Module):
 
 
 class PositionEmbeddingSine(nn.Module):
-    """
-    A module for generating sinusoidal positional embeddings for 2D inputs like images.
+    """A module for generating sinusoidal positional embeddings for 2D inputs like images.
 
-    This class implements sinusoidal position encoding for 2D spatial positions, which can be used in
-    transformer-based models for computer vision tasks.
+    This class implements sinusoidal position encoding for 2D spatial positions, which can be used in transformer-based
+    models for computer vision tasks.
 
     Attributes:
         num_pos_feats (int): Number of positional features (half of the embedding dimension).
@@ -811,8 +796,7 @@ class PositionEmbeddingSine(nn.Module):
 
 
 class PositionEmbeddingRandom(nn.Module):
-    """
-    Positional encoding using random spatial frequencies.
+    """Positional encoding using random spatial frequencies.
 
     This class generates positional embeddings for input coordinates using random spatial frequencies. It is
     particularly useful for transformer-based models that require position information.
@@ -878,12 +862,11 @@ class PositionEmbeddingRandom(nn.Module):
 
 
 class Block(nn.Module):
-    """
-    Transformer block with support for window attention and residual propagation.
+    """Transformer block with support for window attention and residual propagation.
 
-    This class implements a transformer block that can use either global or windowed self-attention,
-    followed by a feed-forward network. It supports relative positional embeddings and is designed for use in
-    vision transformer architectures.
+    This class implements a transformer block that can use either global or windowed self-attention, followed by a
+    feed-forward network. It supports relative positional embeddings and is designed for use in vision transformer
+    architectures.
 
     Attributes:
         norm1 (nn.Module): First normalization layer.
@@ -917,12 +900,11 @@ class Block(nn.Module):
         window_size: int = 0,
         input_size: tuple[int, int] | None = None,
     ) -> None:
-        """
-        Initialize a transformer block with optional window attention and relative positional embeddings.
+        """Initialize a transformer block with optional window attention and relative positional embeddings.
 
-        This constructor sets up a transformer block that can use either global or windowed self-attention,
-        followed by a feed-forward network. It supports relative positional embeddings and is designed for use in
-        vision transformer architectures.
+        This constructor sets up a transformer block that can use either global or windowed self-attention, followed by
+        a feed-forward network. It supports relative positional embeddings and is designed for use in vision transformer
+        architectures.
 
         Args:
             dim (int): Number of input channels.
@@ -978,12 +960,11 @@ class Block(nn.Module):
 
 
 class REAttention(nn.Module):
-    """
-    Relative Position Attention module for efficient self-attention in transformer architectures.
+    """Relative Position Attention module for efficient self-attention in transformer architectures.
 
-    This class implements a multi-head attention mechanism with relative positional embeddings, designed
-    for use in vision transformer models. It supports optional query pooling and window partitioning for efficient
-    processing of large inputs.
+    This class implements a multi-head attention mechanism with relative positional embeddings, designed for use in
+    vision transformer models. It supports optional query pooling and window partitioning for efficient processing of
+    large inputs.
 
     Attributes:
         num_heads (int): Number of attention heads.
@@ -1014,11 +995,10 @@ class REAttention(nn.Module):
         rel_pos_zero_init: bool = True,
         input_size: tuple[int, int] | None = None,
    ) -> None:
-        """
-        Initialize a Relative Position Attention module for transformer-based architectures.
+        """Initialize a Relative Position Attention module for transformer-based architectures.
 
-        This module implements multi-head attention with optional relative positional encodings, designed
-        specifically for vision tasks in transformer models.
+        This module implements multi-head attention with optional relative positional encodings, designed specifically
+        for vision tasks in transformer models.
 
         Args:
             dim (int): Number of input channels.
@@ -1070,12 +1050,11 @@ class REAttention(nn.Module):
 
 
 class PatchEmbed(nn.Module):
-    """
-    Image to Patch Embedding module for vision transformer architectures.
+    """Image to Patch Embedding module for vision transformer architectures.
 
-    This module converts an input image into a sequence of patch embeddings using a convolutional layer.
-    It is commonly used as the first layer in vision transformer architectures to transform image data into a
-    suitable format for subsequent transformer blocks.
+    This module converts an input image into a sequence of patch embeddings using a convolutional layer. It is commonly
+    used as the first layer in vision transformer architectures to transform image data into a suitable format for
+    subsequent transformer blocks.
 
     Attributes:
         proj (nn.Conv2d): Convolutional layer for projecting image patches to embeddings.
@@ -1099,11 +1078,10 @@ class PatchEmbed(nn.Module):
         in_chans: int = 3,
         embed_dim: int = 768,
     ) -> None:
-        """
-        Initialize the PatchEmbed module for converting image patches to embeddings.
+        """Initialize the PatchEmbed module for converting image patches to embeddings.
 
-        This module is typically used as the first layer in vision transformer architectures to transform
-        image data into a suitable format for subsequent transformer blocks.
+        This module is typically used as the first layer in vision transformer architectures to transform image data
+        into a suitable format for subsequent transformer blocks.
 
         Args:
             kernel_size (tuple[int, int]): Size of the convolutional kernel for patch extraction.
ultralytics/models/sam/modules/decoders.py

@@ -9,8 +9,7 @@ from ultralytics.nn.modules import MLP, LayerNorm2d
 
 
 class MaskDecoder(nn.Module):
-    """
-    Decoder module for generating masks and their associated quality scores using a transformer architecture.
+    """Decoder module for generating masks and their associated quality scores using a transformer architecture.
 
     This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
     generate mask predictions along with their quality scores.
@@ -47,8 +46,7 @@ class MaskDecoder(nn.Module):
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
     ) -> None:
-        """
-        Initialize the MaskDecoder module for generating masks and their associated quality scores.
+        """Initialize the MaskDecoder module for generating masks and their associated quality scores.
 
         Args:
             transformer_dim (int): Channel dimension for the transformer module.
@@ -94,8 +92,7 @@ class MaskDecoder(nn.Module):
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Predict masks given image and prompt embeddings.
+        """Predict masks given image and prompt embeddings.
 
         Args:
             image_embeddings (torch.Tensor): Embeddings from the image encoder.
@@ -172,11 +169,10 @@ class MaskDecoder(nn.Module):
 
 
 class SAM2MaskDecoder(nn.Module):
-    """
-    Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.
+    """Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.
 
-    This class extends the functionality of the MaskDecoder, incorporating additional features such as
-    high-resolution feature processing, dynamic multimask output, and object score prediction.
+    This class extends the functionality of the MaskDecoder, incorporating additional features such as high-resolution
+    feature processing, dynamic multimask output, and object score prediction.
 
     Attributes:
         transformer_dim (int): Channel dimension of the transformer.
@@ -233,11 +229,10 @@ class SAM2MaskDecoder(nn.Module):
         pred_obj_scores_mlp: bool = False,
         use_multimask_token_for_obj_ptr: bool = False,
     ) -> None:
-        """
-        Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.
+        """Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.
 
-        This decoder extends the functionality of MaskDecoder, incorporating additional features such as
-        high-resolution feature processing, dynamic multimask output, and object score prediction.
+        This decoder extends the functionality of MaskDecoder, incorporating additional features such as high-resolution
+        feature processing, dynamic multimask output, and object score prediction.
 
         Args:
             transformer_dim (int): Channel dimension of the transformer.
@@ -319,8 +314,7 @@ class SAM2MaskDecoder(nn.Module):
         repeat_image: bool,
         high_res_features: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Predict masks given image and prompt embeddings.
+        """Predict masks given image and prompt embeddings.
 
         Args:
             image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
@@ -458,17 +452,16 @@ class SAM2MaskDecoder(nn.Module):
         return torch.where(area_u > 0, area_i / area_u, 1.0)
 
     def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
-        """
-        Dynamically select the most stable mask output based on stability scores and IoU predictions.
+        """Dynamically select the most stable mask output based on stability scores and IoU predictions.
 
-        This method is used when outputting a single mask. If the stability score from the current single-mask
-        output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs (based
-        on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask for both
-        clicking and tracking scenarios.
+        This method is used when outputting a single mask. If the stability score from the current single-mask output
+        (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs (based on output
+        tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask for both clicking and
+        tracking scenarios.
 
         Args:
-            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
-                batch size, N is number of masks (typically 4), and H, W are mask dimensions.
+            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is batch size, N
+                is number of masks (typically 4), and H, W are mask dimensions.
             all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).
 
         Returns:
ultralytics/models/sam/modules/encoders.py

@@ -21,8 +21,7 @@ from .blocks import (
 
 
 class ImageEncoderViT(nn.Module):
-    """
-    An image encoder using Vision Transformer (ViT) architecture for encoding images into a compact latent space.
+    """An image encoder using Vision Transformer (ViT) architecture for encoding images into a compact latent space.
 
     This class processes images by splitting them into patches, applying transformer blocks, and generating a final
     encoded representation through a neck module.
@@ -64,8 +63,7 @@ class ImageEncoderViT(nn.Module):
         window_size: int = 0,
         global_attn_indexes: tuple[int, ...] = (),
     ) -> None:
-        """
-        Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
+        """Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
 
         Args:
             img_size (int): Input image size, assumed to be square.
@@ -156,8 +154,7 @@ class ImageEncoderViT(nn.Module):
 
 
 class PromptEncoder(nn.Module):
-    """
-    Encode different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.
+    """Encode different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.
 
     Attributes:
         embed_dim (int): Dimension of the embeddings.
@@ -193,8 +190,7 @@ class PromptEncoder(nn.Module):
         mask_in_chans: int,
         activation: type[nn.Module] = nn.GELU,
     ) -> None:
-        """
-        Initialize the PromptEncoder module for encoding various types of prompts.
+        """Initialize the PromptEncoder module for encoding various types of prompts.
 
         Args:
             embed_dim (int): The dimension of the embeddings.
@@ -236,15 +232,14 @@ class PromptEncoder(nn.Module):
         self.no_mask_embed = nn.Embedding(1, embed_dim)
 
     def get_dense_pe(self) -> torch.Tensor:
-        """
-        Return the dense positional encoding used for encoding point prompts.
+        """Return the dense positional encoding used for encoding point prompts.
 
         Generate a positional encoding for a dense set of points matching the shape of the image
         encoding. The encoding is used to provide spatial information to the model when processing point prompts.
 
         Returns:
-            (torch.Tensor): Positional encoding tensor with shape (1, embed_dim, H, W), where H and W are the
-                height and width of the image embedding size, respectively.
+            (torch.Tensor): Positional encoding tensor with shape (1, embed_dim, H, W), where H and W are the height and
+                width of the image embedding size, respectively.
 
         Examples:
             >>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
@@ -306,13 +301,11 @@ class PromptEncoder(nn.Module):
         boxes: torch.Tensor | None,
         masks: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Embed different types of prompts, returning both sparse and dense embeddings.
+        """Embed different types of prompts, returning both sparse and dense embeddings.
 
         Args:
-            points (tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
-                tensor contains coordinates of shape (B, N, 2), and the second tensor contains labels of
-                shape (B, N).
+            points (tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first tensor
+                contains coordinates of shape (B, N, 2), and the second tensor contains labels of shape (B, N).
             boxes (torch.Tensor | None): Boxes to embed with shape (B, M, 2, 2), where M is the number of boxes.
             masks (torch.Tensor | None): Masks to embed with shape (B, 1, H, W).
 
@@ -354,11 +347,10 @@ class PromptEncoder(nn.Module):
 
 
 class MemoryEncoder(nn.Module):
-    """
-    Encode pixel features and masks into a memory representation for efficient image segmentation.
+    """Encode pixel features and masks into a memory representation for efficient image segmentation.
 
-    This class processes pixel-level features and masks, fusing them to generate encoded memory representations
-    suitable for downstream tasks in image segmentation models like SAM (Segment Anything Model).
+    This class processes pixel-level features and masks, fusing them to generate encoded memory representations suitable
+    for downstream tasks in image segmentation models like SAM (Segment Anything Model).
 
     Attributes:
         mask_downsampler (MaskDownSampler): Module for downsampling input masks.
@@ -385,8 +377,7 @@ class MemoryEncoder(nn.Module):
         out_dim,
         in_dim=256,  # in_dim of pix_feats
     ):
-        """
-        Initialize the MemoryEncoder for encoding pixel features and masks into memory representations.
+        """Initialize the MemoryEncoder for encoding pixel features and masks into memory representations.
 
         This encoder processes pixel-level features and masks, fusing them to generate encoded memory representations
         suitable for downstream tasks in image segmentation models like SAM (Segment Anything Model).
@@ -439,11 +430,10 @@ class MemoryEncoder(nn.Module):
 
 
 class ImageEncoder(nn.Module):
-    """
-    Encode images using a trunk-neck architecture, producing multiscale features and positional encodings.
+    """Encode images using a trunk-neck architecture, producing multiscale features and positional encodings.
 
-    This class combines a trunk network for feature extraction with a neck network for feature refinement
-    and positional encoding generation. It can optionally discard the lowest resolution features.
+    This class combines a trunk network for feature extraction with a neck network for feature refinement and positional
+    encoding generation. It can optionally discard the lowest resolution features.
 
     Attributes:
         trunk (nn.Module): The trunk network for initial feature extraction.
@@ -469,11 +459,10 @@ class ImageEncoder(nn.Module):
         neck: nn.Module,
         scalp: int = 0,
     ):
-        """
-        Initialize the ImageEncoder with trunk and neck networks for feature extraction and refinement.
+        """Initialize the ImageEncoder with trunk and neck networks for feature extraction and refinement.
 
-        This encoder combines a trunk network for feature extraction with a neck network for feature refinement
-        and positional encoding generation. It can optionally discard the lowest resolution features.
+        This encoder combines a trunk network for feature extraction with a neck network for feature refinement and
+        positional encoding generation. It can optionally discard the lowest resolution features.
 
         Args:
             trunk (nn.Module): The trunk network for initial feature extraction.
@@ -513,11 +502,10 @@ class ImageEncoder(nn.Module):
 
 
 class FpnNeck(nn.Module):
-    """
-    A Feature Pyramid Network (FPN) neck variant for multiscale feature fusion in object detection models.
+    """A Feature Pyramid Network (FPN) neck variant for multiscale feature fusion in object detection models.
 
-    This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
-    similar to ViT positional embedding interpolation.
+    This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing, similar to ViT
+    positional embedding interpolation.
 
     Attributes:
         position_encoding (PositionEmbeddingSine): Sinusoidal positional encoding module.
@@ -550,11 +538,10 @@ class FpnNeck(nn.Module):
         fuse_type: str = "sum",
         fpn_top_down_levels: list[int] | None = None,
    ):
-        """
-        Initialize a modified Feature Pyramid Network (FPN) neck.
+        """Initialize a modified Feature Pyramid Network (FPN) neck.
 
-        This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
-        similar to ViT positional embedding interpolation.
+        This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing, similar to
+        ViT positional embedding interpolation.
 
         Args:
            d_model (int): Dimension of the model.
@@ -603,8 +590,7 @@ class FpnNeck(nn.Module):
         self.fpn_top_down_levels = list(fpn_top_down_levels)
 
     def forward(self, xs: list[torch.Tensor]):
-        """
-        Perform forward pass through the Feature Pyramid Network (FPN) neck.
+        """Perform forward pass through the Feature Pyramid Network (FPN) neck.
 
         This method processes a list of input tensors from the backbone through the FPN, applying lateral connections
         and top-down feature fusion. It generates output feature maps and corresponding positional encodings.
@@ -613,8 +599,8 @@ class FpnNeck(nn.Module):
            xs (list[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).
 
         Returns:
-            out (list[torch.Tensor]): List of output feature maps after FPN processing, each with shape
-                (B, d_model, H, W).
+            out (list[torch.Tensor]): List of output feature maps after FPN processing, each with shape (B, d_model, H,
+                W).
            pos (list[torch.Tensor]): List of positional encodings corresponding to each output feature map.
 
         Examples:
@@ -656,12 +642,11 @@ class FpnNeck(nn.Module):
 
 
 class Hiera(nn.Module):
-    """
-    Hierarchical vision transformer for efficient multiscale feature extraction in image processing tasks.
+    """Hierarchical vision transformer for efficient multiscale feature extraction in image processing tasks.
 
-    This class implements a Hiera model, which is a hierarchical vision transformer architecture designed for
-    efficient multiscale feature extraction. It uses a series of transformer blocks organized into stages, with
-    optional pooling and global attention mechanisms.
+    This class implements a Hiera model, which is a hierarchical vision transformer architecture designed for efficient
+    multiscale feature extraction. It uses a series of transformer blocks organized into stages, with optional pooling
+    and global attention mechanisms.
 
     Attributes:
         window_spec (tuple[int, ...]): Window sizes for each stage.
@@ -715,12 +700,11 @@ class Hiera(nn.Module):
         ),
         return_interm_layers=True,  # return feats from every stage
     ):
-        """
-        Initialize a Hiera model, a hierarchical vision transformer for efficient multiscale feature extraction.
+        """Initialize a Hiera model, a hierarchical vision transformer for efficient multiscale feature extraction.
 
-        Hiera is a hierarchical vision transformer architecture designed for efficient multiscale feature extraction
-        in image processing tasks. It uses a series of transformer blocks organized into stages, with optional
-        pooling and global attention mechanisms.
+        Hiera is a hierarchical vision transformer architecture designed for efficient multiscale feature extraction in
+        image processing tasks. It uses a series of transformer blocks organized into stages, with optional pooling and
+        global attention mechanisms.
 
         Args:
            embed_dim (int): Initial embedding dimension for the model.
@@ -731,7 +715,8 @@ class Hiera(nn.Module):
            stages (tuple[int, ...]): Number of blocks per stage.
            dim_mul (float): Dimension multiplier factor at stage transitions.
            head_mul (float): Head multiplier factor at stage transitions.
-            window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding
+            window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding
+                background.
            window_spec (tuple[int, ...]): Window sizes for each stage when not using global attention.
            global_att_blocks (tuple[int, ...]): Indices of blocks that use global attention.
            return_interm_layers (bool): Whether to return intermediate layer outputs.
@@ -816,8 +801,7 @@ class Hiera(nn.Module):
         return pos_embed
 
     def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
-        """
-        Perform forward pass through Hiera model, extracting multiscale features from input images.
+        """Perform forward pass through Hiera model, extracting multiscale features from input images.
 
         Args:
            x (torch.Tensor): Input tensor with shape (B, C, H, W) representing a batch of images.