dgenerate-ultralytics-headless 8.3.141__py3-none-any.whl → 8.3.144__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. {dgenerate_ultralytics_headless-8.3.141.dist-info → dgenerate_ultralytics_headless-8.3.144.dist-info}/METADATA +1 -1
  2. dgenerate_ultralytics_headless-8.3.144.dist-info/RECORD +272 -0
  3. tests/conftest.py +7 -24
  4. tests/test_cli.py +1 -1
  5. tests/test_cuda.py +7 -2
  6. tests/test_engine.py +7 -8
  7. tests/test_exports.py +16 -16
  8. tests/test_integrations.py +1 -1
  9. tests/test_solutions.py +12 -12
  10. ultralytics/__init__.py +1 -1
  11. ultralytics/cfg/__init__.py +22 -19
  12. ultralytics/data/annotator.py +6 -5
  13. ultralytics/data/augment.py +127 -126
  14. ultralytics/data/base.py +54 -51
  15. ultralytics/data/build.py +47 -23
  16. ultralytics/data/converter.py +47 -43
  17. ultralytics/data/dataset.py +51 -50
  18. ultralytics/data/loaders.py +77 -44
  19. ultralytics/data/split.py +22 -9
  20. ultralytics/data/split_dota.py +63 -39
  21. ultralytics/data/utils.py +59 -39
  22. ultralytics/engine/exporter.py +79 -27
  23. ultralytics/engine/model.py +39 -39
  24. ultralytics/engine/predictor.py +37 -28
  25. ultralytics/engine/results.py +187 -158
  26. ultralytics/engine/trainer.py +36 -19
  27. ultralytics/engine/tuner.py +12 -9
  28. ultralytics/engine/validator.py +7 -9
  29. ultralytics/hub/__init__.py +11 -13
  30. ultralytics/hub/auth.py +22 -2
  31. ultralytics/hub/google/__init__.py +19 -19
  32. ultralytics/hub/session.py +37 -51
  33. ultralytics/hub/utils.py +19 -5
  34. ultralytics/models/fastsam/model.py +30 -12
  35. ultralytics/models/fastsam/predict.py +5 -6
  36. ultralytics/models/fastsam/utils.py +3 -3
  37. ultralytics/models/fastsam/val.py +10 -6
  38. ultralytics/models/nas/model.py +9 -5
  39. ultralytics/models/nas/predict.py +6 -6
  40. ultralytics/models/nas/val.py +3 -3
  41. ultralytics/models/rtdetr/model.py +7 -6
  42. ultralytics/models/rtdetr/predict.py +14 -7
  43. ultralytics/models/rtdetr/train.py +10 -4
  44. ultralytics/models/rtdetr/val.py +36 -9
  45. ultralytics/models/sam/amg.py +30 -12
  46. ultralytics/models/sam/build.py +22 -22
  47. ultralytics/models/sam/model.py +10 -9
  48. ultralytics/models/sam/modules/blocks.py +76 -80
  49. ultralytics/models/sam/modules/decoders.py +6 -8
  50. ultralytics/models/sam/modules/encoders.py +23 -26
  51. ultralytics/models/sam/modules/memory_attention.py +13 -1
  52. ultralytics/models/sam/modules/sam.py +57 -26
  53. ultralytics/models/sam/modules/tiny_encoder.py +232 -237
  54. ultralytics/models/sam/modules/transformer.py +13 -13
  55. ultralytics/models/sam/modules/utils.py +11 -19
  56. ultralytics/models/sam/predict.py +114 -101
  57. ultralytics/models/utils/loss.py +98 -77
  58. ultralytics/models/utils/ops.py +116 -67
  59. ultralytics/models/yolo/classify/predict.py +5 -5
  60. ultralytics/models/yolo/classify/train.py +32 -28
  61. ultralytics/models/yolo/classify/val.py +7 -8
  62. ultralytics/models/yolo/detect/predict.py +1 -0
  63. ultralytics/models/yolo/detect/train.py +15 -14
  64. ultralytics/models/yolo/detect/val.py +37 -36
  65. ultralytics/models/yolo/model.py +106 -23
  66. ultralytics/models/yolo/obb/predict.py +3 -4
  67. ultralytics/models/yolo/obb/train.py +14 -6
  68. ultralytics/models/yolo/obb/val.py +29 -23
  69. ultralytics/models/yolo/pose/predict.py +9 -8
  70. ultralytics/models/yolo/pose/train.py +24 -16
  71. ultralytics/models/yolo/pose/val.py +44 -26
  72. ultralytics/models/yolo/segment/predict.py +5 -5
  73. ultralytics/models/yolo/segment/train.py +11 -7
  74. ultralytics/models/yolo/segment/val.py +2 -2
  75. ultralytics/models/yolo/world/train.py +33 -23
  76. ultralytics/models/yolo/world/train_world.py +11 -3
  77. ultralytics/models/yolo/yoloe/predict.py +11 -11
  78. ultralytics/models/yolo/yoloe/train.py +73 -21
  79. ultralytics/models/yolo/yoloe/train_seg.py +10 -7
  80. ultralytics/models/yolo/yoloe/val.py +42 -18
  81. ultralytics/nn/autobackend.py +59 -15
  82. ultralytics/nn/modules/__init__.py +4 -4
  83. ultralytics/nn/modules/activation.py +4 -1
  84. ultralytics/nn/modules/block.py +178 -111
  85. ultralytics/nn/modules/conv.py +6 -5
  86. ultralytics/nn/modules/head.py +469 -121
  87. ultralytics/nn/modules/transformer.py +147 -58
  88. ultralytics/nn/tasks.py +227 -20
  89. ultralytics/nn/text_model.py +30 -33
  90. ultralytics/solutions/ai_gym.py +1 -1
  91. ultralytics/solutions/analytics.py +7 -4
  92. ultralytics/solutions/config.py +10 -10
  93. ultralytics/solutions/distance_calculation.py +13 -11
  94. ultralytics/solutions/heatmap.py +1 -1
  95. ultralytics/solutions/instance_segmentation.py +6 -3
  96. ultralytics/solutions/object_blurrer.py +3 -3
  97. ultralytics/solutions/object_counter.py +18 -12
  98. ultralytics/solutions/object_cropper.py +12 -5
  99. ultralytics/solutions/parking_management.py +29 -28
  100. ultralytics/solutions/queue_management.py +6 -6
  101. ultralytics/solutions/region_counter.py +10 -3
  102. ultralytics/solutions/security_alarm.py +3 -3
  103. ultralytics/solutions/similarity_search.py +85 -24
  104. ultralytics/solutions/solutions.py +215 -85
  105. ultralytics/solutions/speed_estimation.py +28 -22
  106. ultralytics/solutions/streamlit_inference.py +17 -12
  107. ultralytics/solutions/trackzone.py +4 -4
  108. ultralytics/trackers/basetrack.py +16 -23
  109. ultralytics/trackers/bot_sort.py +30 -20
  110. ultralytics/trackers/byte_tracker.py +70 -64
  111. ultralytics/trackers/track.py +4 -8
  112. ultralytics/trackers/utils/gmc.py +31 -58
  113. ultralytics/trackers/utils/kalman_filter.py +37 -37
  114. ultralytics/trackers/utils/matching.py +1 -1
  115. ultralytics/utils/__init__.py +105 -89
  116. ultralytics/utils/autobatch.py +16 -3
  117. ultralytics/utils/autodevice.py +54 -24
  118. ultralytics/utils/benchmarks.py +42 -28
  119. ultralytics/utils/callbacks/base.py +3 -3
  120. ultralytics/utils/callbacks/clearml.py +9 -9
  121. ultralytics/utils/callbacks/comet.py +67 -25
  122. ultralytics/utils/callbacks/dvc.py +7 -10
  123. ultralytics/utils/callbacks/mlflow.py +2 -5
  124. ultralytics/utils/callbacks/neptune.py +7 -13
  125. ultralytics/utils/callbacks/raytune.py +1 -1
  126. ultralytics/utils/callbacks/tensorboard.py +5 -6
  127. ultralytics/utils/callbacks/wb.py +14 -14
  128. ultralytics/utils/checks.py +14 -13
  129. ultralytics/utils/dist.py +5 -5
  130. ultralytics/utils/downloads.py +94 -67
  131. ultralytics/utils/errors.py +5 -5
  132. ultralytics/utils/export.py +61 -47
  133. ultralytics/utils/files.py +23 -22
  134. ultralytics/utils/instance.py +48 -52
  135. ultralytics/utils/loss.py +78 -40
  136. ultralytics/utils/metrics.py +186 -130
  137. ultralytics/utils/ops.py +186 -190
  138. ultralytics/utils/patches.py +15 -17
  139. ultralytics/utils/plotting.py +84 -42
  140. ultralytics/utils/tal.py +21 -15
  141. ultralytics/utils/torch_utils.py +53 -50
  142. ultralytics/utils/triton.py +5 -4
  143. ultralytics/utils/tuner.py +5 -5
  144. dgenerate_ultralytics_headless-8.3.141.dist-info/RECORD +0 -272
  145. {dgenerate_ultralytics_headless-8.3.141.dist-info → dgenerate_ultralytics_headless-8.3.144.dist-info}/WHEEL +0 -0
  146. {dgenerate_ultralytics_headless-8.3.141.dist-info → dgenerate_ultralytics_headless-8.3.144.dist-info}/entry_points.txt +0 -0
  147. {dgenerate_ultralytics_headless-8.3.141.dist-info → dgenerate_ultralytics_headless-8.3.144.dist-info}/licenses/LICENSE +0 -0
  148. {dgenerate_ultralytics_headless-8.3.141.dist-info → dgenerate_ultralytics_headless-8.3.144.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/blocks.py

@@ -33,14 +33,14 @@ class DropPath(nn.Module):
  >>> output = drop_path(x)
  """

- def __init__(self, drop_prob=0.0, scale_by_keep=True):
+ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
  """Initialize DropPath module for stochastic depth regularization during training."""
  super().__init__()
  self.drop_prob = drop_prob
  self.scale_by_keep = scale_by_keep

- def forward(self, x):
- """Applies stochastic depth to input tensor during training, with optional scaling."""
+ def forward(self, x: Tensor) -> Tensor:
+ """Apply stochastic depth to input tensor during training, with optional scaling."""
  if self.drop_prob == 0.0 or not self.training:
  return x
  keep_prob = 1 - self.drop_prob
@@ -76,14 +76,14 @@ class MaskDownSampler(nn.Module):

  def __init__(
  self,
- embed_dim=256,
- kernel_size=4,
- stride=4,
- padding=0,
- total_stride=16,
- activation=nn.GELU,
+ embed_dim: int = 256,
+ kernel_size: int = 4,
+ stride: int = 4,
+ padding: int = 0,
+ total_stride: int = 16,
+ activation: Type[nn.Module] = nn.GELU,
  ):
- """Initializes a mask downsampler module for progressive downsampling and channel expansion."""
+ """Initialize a mask downsampler module for progressive downsampling and channel expansion."""
  super().__init__()
  num_layers = int(math.log2(total_stride) // math.log2(stride))
  assert stride**num_layers == total_stride
@@ -106,8 +106,8 @@ class MaskDownSampler(nn.Module):

  self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1))

- def forward(self, x):
- """Downsamples and encodes input mask to embed_dim channels using convolutional layers and LayerNorm2d."""
+ def forward(self, x: Tensor) -> Tensor:
+ """Downsample and encode input mask to embed_dim channels using convolutional layers and LayerNorm2d."""
  return self.encoder(x)


@@ -141,12 +141,12 @@ class CXBlock(nn.Module):

  def __init__(
  self,
- dim,
- kernel_size=7,
- padding=3,
- drop_path=0.0,
- layer_scale_init_value=1e-6,
- use_dwconv=True,
+ dim: int,
+ kernel_size: int = 7,
+ padding: int = 3,
+ drop_path: float = 0.0,
+ layer_scale_init_value: float = 1e-6,
+ use_dwconv: bool = True,
  ):
  """
  Initialize a ConvNeXt Block for efficient feature extraction in convolutional neural networks.
@@ -188,8 +188,8 @@ class CXBlock(nn.Module):
  )
  self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

- def forward(self, x):
- """Applies ConvNeXt block operations to input tensor, including convolutions and residual connection."""
+ def forward(self, x: Tensor) -> Tensor:
+ """Apply ConvNeXt block operations to input tensor, including convolutions and residual connection."""
  input = x
  x = self.dwconv(x)
  x = self.norm(x)
@@ -227,9 +227,9 @@ class Fuser(nn.Module):
  torch.Size([1, 256, 32, 32])
  """

- def __init__(self, layer, num_layers, dim=None, input_projection=False):
+ def __init__(self, layer: nn.Module, num_layers: int, dim: Optional[int] = None, input_projection: bool = False):
  """
- Initializes the Fuser module for feature fusion through multiple layers.
+ Initialize the Fuser module for feature fusion through multiple layers.

  This module creates a sequence of identical layers and optionally applies an input projection.

@@ -253,8 +253,8 @@ class Fuser(nn.Module):
  assert dim is not None
  self.proj = nn.Conv2d(dim, dim, kernel_size=1)

- def forward(self, x):
- """Applies a series of layers to the input tensor, optionally projecting it first."""
+ def forward(self, x: Tensor) -> Tensor:
+ """Apply a series of layers to the input tensor, optionally projecting it first."""
  x = self.proj(x)
  for layer in self.layers:
  x = layer(x)
@@ -300,7 +300,7 @@ class SAM2TwoWayAttentionBlock(TwoWayAttentionBlock):
  skip_first_layer_pe: bool = False,
  ) -> None:
  """
- Initializes a SAM2TwoWayAttentionBlock for performing self-attention and cross-attention in two directions.
+ Initialize a SAM2TwoWayAttentionBlock for performing self-attention and cross-attention in two directions.

  This block extends the TwoWayAttentionBlock and consists of four main components: self-attention on sparse
  inputs, cross-attention from sparse to dense inputs, an MLP block on sparse inputs, and cross-attention
@@ -363,7 +363,7 @@ class SAM2TwoWayTransformer(TwoWayTransformer):
  attention_downsample_rate: int = 2,
  ) -> None:
  """
- Initializes a SAM2TwoWayTransformer instance.
+ Initialize a SAM2TwoWayTransformer instance.

  This transformer decoder attends to an input image using queries with supplied positional embeddings.
  It is designed for tasks like object detection, image segmentation, and point cloud processing.
@@ -430,12 +430,12 @@ class RoPEAttention(Attention):
  def __init__(
  self,
  *args,
- rope_theta=10000.0,
- rope_k_repeat=False,
- feat_sizes=(32, 32), # [w, h] for stride 16 feats at 512 resolution
+ rope_theta: float = 10000.0,
+ rope_k_repeat: bool = False,
+ feat_sizes: Tuple[int, int] = (32, 32), # [w, h] for stride 16 feats at 512 resolution
  **kwargs,
  ):
- """Initializes RoPEAttention with rotary position encoding for enhanced positional awareness."""
+ """Initialize RoPEAttention with rotary position encoding for enhanced positional awareness."""
  super().__init__(*args, **kwargs)

  self.compute_cis = partial(compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta)
@@ -444,7 +444,7 @@ class RoPEAttention(Attention):
  self.rope_k_repeat = rope_k_repeat # repeat q rope to match k length, needed for cross-attention to memories

  def forward(self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0) -> Tensor:
- """Applies rotary position encoding and computes attention between query, key, and value tensors."""
+ """Apply rotary position encoding and compute attention between query, key, and value tensors."""
  q = self.q_proj(q)
  k = self.k_proj(k)
  v = self.v_proj(v)
@@ -486,7 +486,7 @@ class RoPEAttention(Attention):


  def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor:
- """Applies pooling and optional normalization to a tensor, handling spatial dimension permutations."""
+ """Apply pooling and optional normalization to a tensor, handling spatial dimension permutations."""
  if pool is None:
  return x
  # (B, H, W, C) -> (B, C, H, W)
@@ -537,7 +537,7 @@ class MultiScaleAttention(nn.Module):
  num_heads: int,
  q_pool: nn.Module = None,
  ):
- """Initializes multiscale attention with optional query pooling for efficient feature extraction."""
+ """Initialize multiscale attention with optional query pooling for efficient feature extraction."""
  super().__init__()

  self.dim = dim
@@ -552,7 +552,7 @@ class MultiScaleAttention(nn.Module):
  self.proj = nn.Linear(dim_out, dim_out)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
- """Applies multiscale attention with optional query pooling to extract multiscale features."""
+ """Apply multiscale attention with optional query pooling to extract multiscale features."""
  B, H, W, _ = x.shape
  # qkv with shape (B, H * W, 3, nHead, C)
  qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1)
@@ -620,10 +620,10 @@ class MultiScaleBlock(nn.Module):
  drop_path: float = 0.0,
  norm_layer: Union[nn.Module, str] = "LayerNorm",
  q_stride: Tuple[int, int] = None,
- act_layer: nn.Module = nn.GELU,
+ act_layer: Type[nn.Module] = nn.GELU,
  window_size: int = 0,
  ):
- """Initializes a multiscale attention block with window partitioning and optional query pooling."""
+ """Initialize a multiscale attention block with window partitioning and optional query pooling."""
  super().__init__()

  if isinstance(norm_layer, str):
@@ -660,7 +660,7 @@ class MultiScaleBlock(nn.Module):
  self.proj = nn.Linear(dim, dim_out)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
- """Processes input through multiscale attention and MLP, with optional windowing and downsampling."""
+ """Process input through multiscale attention and MLP, with optional windowing and downsampling."""
  shortcut = x # B, H, W, C
  x = self.norm1(x)

@@ -725,12 +725,12 @@ class PositionEmbeddingSine(nn.Module):

  def __init__(
  self,
- num_pos_feats,
+ num_pos_feats: int,
  temperature: int = 10000,
  normalize: bool = True,
  scale: Optional[float] = None,
  ):
- """Initializes sinusoidal position embeddings for 2D image inputs."""
+ """Initialize sinusoidal position embeddings for 2D image inputs."""
  super().__init__()
  assert num_pos_feats % 2 == 0, "Expecting even model width"
  self.num_pos_feats = num_pos_feats // 2
@@ -744,8 +744,8 @@ class PositionEmbeddingSine(nn.Module):

  self.cache = {}

- def _encode_xy(self, x, y):
- """Encodes 2D positions using sine/cosine functions for transformer positional embeddings."""
+ def _encode_xy(self, x: Tensor, y: Tensor) -> Tuple[Tensor, Tensor]:
+ """Encode 2D positions using sine/cosine functions for transformer positional embeddings."""
  assert len(x) == len(y) and x.ndim == y.ndim == 1
  x_embed = x * self.scale
  y_embed = y * self.scale
@@ -760,16 +760,16 @@ class PositionEmbeddingSine(nn.Module):
  return pos_x, pos_y

  @torch.no_grad()
- def encode_boxes(self, x, y, w, h):
- """Encodes box coordinates and dimensions into positional embeddings for detection."""
+ def encode_boxes(self, x: Tensor, y: Tensor, w: Tensor, h: Tensor) -> Tensor:
+ """Encode box coordinates and dimensions into positional embeddings for detection."""
  pos_x, pos_y = self._encode_xy(x, y)
  return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)

  encode = encode_boxes # Backwards compatibility

  @torch.no_grad()
- def encode_points(self, x, y, labels):
- """Encodes 2D points with sinusoidal embeddings and appends labels."""
+ def encode_points(self, x: Tensor, y: Tensor, labels: Tensor) -> Tensor:
+ """Encode 2D points with sinusoidal embeddings and append labels."""
  (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
  assert bx == by and nx == ny and bx == bl and nx == nl
  pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
@@ -777,8 +777,8 @@ class PositionEmbeddingSine(nn.Module):
  return torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)

  @torch.no_grad()
- def forward(self, x: torch.Tensor):
- """Generates sinusoidal position embeddings for 2D inputs like images."""
+ def forward(self, x: torch.Tensor) -> Tensor:
+ """Generate sinusoidal position embeddings for 2D inputs like images."""
  cache_key = (x.shape[-2], x.shape[-1])
  if cache_key in self.cache:
  return self.cache[cache_key][None].repeat(x.shape[0], 1, 1, 1)
@@ -834,7 +834,7 @@ class PositionEmbeddingRandom(nn.Module):
  """

  def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
- """Initializes random spatial frequency position embedding for transformers."""
+ """Initialize random spatial frequency position embedding for transformers."""
  super().__init__()
  if scale is None or scale <= 0.0:
  scale = 1.0
@@ -845,7 +845,7 @@ class PositionEmbeddingRandom(nn.Module):
  torch.backends.cudnn.deterministic = False

  def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
- """Encodes normalized [0,1] coordinates using random spatial frequencies."""
+ """Encode normalized [0,1] coordinates using random spatial frequencies."""
  # Assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
  coords = 2 * coords - 1
  coords = coords @ self.positional_encoding_gaussian_matrix
@@ -854,7 +854,7 @@ class PositionEmbeddingRandom(nn.Module):
  return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

  def forward(self, size: Tuple[int, int]) -> torch.Tensor:
- """Generates positional encoding for a grid using random spatial frequencies."""
+ """Generate positional encoding for a grid using random spatial frequencies."""
  h, w = size
  device: Any = self.positional_encoding_gaussian_matrix.device
  grid = torch.ones((h, w), device=device, dtype=torch.float32)
@@ -867,7 +867,7 @@ class PositionEmbeddingRandom(nn.Module):
  return pe.permute(2, 0, 1) # C x H x W

  def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
- """Positionally encodes input coordinates, normalizing them to [0,1] based on the given image size."""
+ """Positionally encode input coordinates, normalizing them to [0,1] based on the given image size."""
  coords = coords_input.clone()
  coords[:, :, 0] = coords[:, :, 0] / image_size[1]
  coords[:, :, 1] = coords[:, :, 1] / image_size[0]
@@ -915,7 +915,7 @@ class Block(nn.Module):
  input_size: Optional[Tuple[int, int]] = None,
  ) -> None:
  """
- Initializes a transformer block with optional window attention and relative positional embeddings.
+ Initialize a transformer block with optional window attention and relative positional embeddings.

  This constructor sets up a transformer block that can use either global or windowed self-attention,
  followed by a feed-forward network. It supports relative positional embeddings and is designed
@@ -931,7 +931,7 @@ class Block(nn.Module):
  use_rel_pos (bool): If True, uses relative positional embeddings in attention.
  rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero.
  window_size (int): Size of attention window. If 0, uses global attention.
- input_size (Optional[Tuple[int, int]]): Input resolution for calculating relative positional parameter size.
+ input_size (Tuple[int, int] | None): Input resolution for calculating relative positional parameter size.

  Examples:
  >>> block = Block(dim=256, num_heads=8, window_size=7)
@@ -957,7 +957,7 @@ class Block(nn.Module):
  self.window_size = window_size

  def forward(self, x: torch.Tensor) -> torch.Tensor:
- """Processes input through transformer block with optional windowed self-attention and residual connection."""
+ """Process input through transformer block with optional windowed self-attention and residual connection."""
  shortcut = x
  x = self.norm1(x)
  # Window partition
@@ -976,34 +976,30 @@ class Block(nn.Module):

  class REAttention(nn.Module):
  """
- Rotary Embedding Attention module for efficient self-attention in transformer architectures.
+ Relative Position Attention module for efficient self-attention in transformer architectures.

- This class implements a multi-head attention mechanism with rotary positional embeddings, designed
+ This class implements a multi-head attention mechanism with relative positional embeddings, designed
  for use in vision transformer models. It supports optional query pooling and window partitioning
  for efficient processing of large inputs.

  Attributes:
- compute_cis (Callable): Function to compute axial complex numbers for rotary encoding.
- freqs_cis (Tensor): Precomputed frequency tensor for rotary encoding.
- rope_k_repeat (bool): Flag to repeat query RoPE to match key length for cross-attention to memories.
- q_proj (nn.Linear): Linear projection for query.
- k_proj (nn.Linear): Linear projection for key.
- v_proj (nn.Linear): Linear projection for value.
- out_proj (nn.Linear): Output projection.
  num_heads (int): Number of attention heads.
- internal_dim (int): Internal dimension for attention computation.
+ scale (float): Scaling factor for attention computation.
+ qkv (nn.Linear): Linear projection for query, key, and value.
+ proj (nn.Linear): Output projection layer.
+ use_rel_pos (bool): Whether to use relative positional embeddings.
+ rel_pos_h (nn.Parameter): Relative positional embeddings for height dimension.
+ rel_pos_w (nn.Parameter): Relative positional embeddings for width dimension.

  Methods:
- forward: Applies rotary position encoding and computes attention between query, key, and value tensors.
+ forward: Applies multi-head attention with optional relative positional encoding to input tensor.

  Examples:
- >>> rope_attn = REAttention(embedding_dim=256, num_heads=8, rope_theta=10000.0, feat_sizes=(32, 32))
- >>> q = torch.randn(1, 1024, 256)
- >>> k = torch.randn(1, 1024, 256)
- >>> v = torch.randn(1, 1024, 256)
- >>> output = rope_attn(q, k, v)
+ >>> attention = REAttention(dim=256, num_heads=8, input_size=(32, 32))
+ >>> x = torch.randn(1, 32, 32, 256)
+ >>> output = attention(x)
  >>> print(output.shape)
- torch.Size([1, 1024, 256])
+ torch.Size([1, 32, 32, 256])
  """

  def __init__(
@@ -1016,19 +1012,19 @@ class REAttention(nn.Module):
  input_size: Optional[Tuple[int, int]] = None,
  ) -> None:
  """
- Initializes a Relative Position Attention module for transformer-based architectures.
+ Initialize a Relative Position Attention module for transformer-based architectures.

  This module implements multi-head attention with optional relative positional encodings, designed
  specifically for vision tasks in transformer models.

  Args:
  dim (int): Number of input channels.
- num_heads (int): Number of attention heads. Default is 8.
- qkv_bias (bool): If True, adds a learnable bias to query, key, value projections. Default is True.
- use_rel_pos (bool): If True, uses relative positional encodings. Default is False.
- rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero. Default is True.
+ num_heads (int): Number of attention heads.
+ qkv_bias (bool): If True, adds a learnable bias to query, key, value projections.
+ use_rel_pos (bool): If True, uses relative positional encodings.
+ rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero.
  input_size (Tuple[int, int] | None): Input resolution for calculating relative positional parameter size.
- Required if use_rel_pos is True. Default is None.
+ Required if use_rel_pos is True.

  Examples:
  >>> attention = REAttention(dim=256, num_heads=8, input_size=(32, 32))
@@ -1053,7 +1049,7 @@ class REAttention(nn.Module):
  self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

  def forward(self, x: torch.Tensor) -> torch.Tensor:
- """Applies multi-head attention with optional relative positional encoding to input tensor."""
+ """Apply multi-head attention with optional relative positional encoding to input tensor."""
  B, H, W, _ = x.shape
  # qkv with shape (3, B, nHead, H * W, C)
  qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
@@ -1101,7 +1097,7 @@ class PatchEmbed(nn.Module):
  embed_dim: int = 768,
  ) -> None:
  """
- Initializes the PatchEmbed module for converting image patches to embeddings.
+ Initialize the PatchEmbed module for converting image patches to embeddings.

  This module is typically used as the first layer in vision transformer architectures to transform
  image data into a suitable format for subsequent transformer blocks.
@@ -1125,5 +1121,5 @@ class PatchEmbed(nn.Module):
  self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
- """Computes patch embedding by applying convolution and transposing resulting tensor."""
+ """Compute patch embedding by applying convolution and transposing resulting tensor."""
  return self.proj(x).permute(0, 2, 3, 1) # B C H W -> B H W C
ultralytics/models/sam/modules/decoders.py

@@ -27,7 +27,7 @@ class MaskDecoder(nn.Module):
  iou_prediction_head (nn.Module): MLP for predicting mask quality.

  Methods:
- forward: Predicts masks given image and prompt embeddings.
+ forward: Predict masks given image and prompt embeddings.
  predict_masks: Internal method for mask prediction.

  Examples:
@@ -129,7 +129,6 @@ class MaskDecoder(nn.Module):
  masks = masks[:, mask_slice, :, :]
  iou_pred = iou_pred[:, mask_slice]

- # Prepare output
  return masks, iou_pred

  def predict_masks(
@@ -201,10 +200,10 @@ class SAM2MaskDecoder(nn.Module):
  dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

  Methods:
- forward: Predicts masks given image and prompt embeddings.
- predict_masks: Predicts instance segmentation masks from image and prompt embeddings.
- _get_stability_scores: Computes mask stability scores based on IoU between thresholds.
- _dynamic_multimask_via_stability: Dynamically selects the most stable mask output.
+ forward: Predict masks given image and prompt embeddings.
+ predict_masks: Predict instance segmentation masks from image and prompt embeddings.
+ _get_stability_scores: Compute mask stability scores based on IoU between thresholds.
+ _dynamic_multimask_via_stability: Dynamically select the most stable mask output.

  Examples:
  >>> image_embeddings = torch.rand(1, 256, 64, 64)
@@ -330,7 +329,7 @@ class SAM2MaskDecoder(nn.Module):
  dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
  multimask_output (bool): Whether to return multiple masks or a single mask.
  repeat_image (bool): Flag to repeat the image embeddings.
- high_res_features (List[torch.Tensor] | None): Optional high-resolution features.
+ high_res_features (List[torch.Tensor] | None, optional): Optional high-resolution features.

  Returns:
  masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
@@ -377,7 +376,6 @@ class SAM2MaskDecoder(nn.Module):
  # are always the single mask token (and we'll let it be the object-memory token).
  sam_tokens_out = mask_tokens_out[:, 0:1] # [b, 1, c] shape

- # Prepare output
  return masks, iou_pred, sam_tokens_out, object_score_logits

  def predict_masks(
ultralytics/models/sam/modules/encoders.py

@@ -35,7 +35,7 @@ class ImageEncoderViT(nn.Module):
  neck (nn.Sequential): Neck module to further process the output.

  Methods:
- forward: Processes input through patch embedding, positional embedding, blocks, and neck.
+ forward: Process input through patch embedding, positional embedding, blocks, and neck.

  Examples:
  >>> import torch
@@ -103,7 +103,7 @@ class ImageEncoderViT(nn.Module):

  self.pos_embed: Optional[nn.Parameter] = None
  if use_abs_pos:
- # Initialize absolute positional embedding with pretrain image size.
+ # Initialize absolute positional embedding with pretrain image size
  self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))

  self.blocks = nn.ModuleList()
@@ -157,7 +157,7 @@ class ImageEncoderViT(nn.Module):

  class PromptEncoder(nn.Module):
  """
- Encodes different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.
+ Encode different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.

  Attributes:
  embed_dim (int): Dimension of the embeddings.
@@ -172,8 +172,8 @@ class PromptEncoder(nn.Module):
  no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.

  Methods:
- get_dense_pe: Returns the positional encoding used to encode point prompts.
- forward: Embeds different types of prompts, returning both sparse and dense embeddings.
+ get_dense_pe: Return the positional encoding used to encode point prompts.
+ forward: Embed different types of prompts, returning both sparse and dense embeddings.

  Examples:
  >>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
@@ -321,9 +321,8 @@ class PromptEncoder(nn.Module):
  masks (torch.Tensor | None): Masks to embed with shape (B, 1, H, W).

  Returns:
- (Tuple[torch.Tensor, torch.Tensor]): A tuple containing:
- - sparse_embeddings (torch.Tensor): Sparse embeddings for points and boxes with shape (B, N, embed_dim).
- - dense_embeddings (torch.Tensor): Dense embeddings for masks of shape (B, embed_dim, embed_H, embed_W).
+ sparse_embeddings (torch.Tensor): Sparse embeddings for points and boxes with shape (B, N, embed_dim).
+ dense_embeddings (torch.Tensor): Dense embeddings for masks of shape (B, embed_dim, embed_H, embed_W).

  Examples:
  >>> encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
@@ -394,7 +393,7 @@ class MemoryEncoder(nn.Module):

  Args:
  out_dim (int): Output dimension of the encoded features.
- in_dim (int): Input dimension of the pixel features. Default is 256.
+ in_dim (int): Input dimension of the pixel features.

  Examples:
  >>> encoder = MemoryEncoder(out_dim=256, in_dim=256)
@@ -420,7 +419,7 @@ class MemoryEncoder(nn.Module):
  pix_feat: torch.Tensor,
  masks: torch.Tensor,
  skip_mask_sigmoid: bool = False,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> dict:
  """Process pixel features and masks to generate encoded memory representations for segmentation."""
  if not skip_mask_sigmoid:
  masks = F.sigmoid(masks)
@@ -499,7 +498,7 @@ class ImageEncoder(nn.Module):
  )

  def forward(self, sample: torch.Tensor):
- """Encode input through patch embedding, positional embedding, transformer blocks, and neck module."""
+ """Encode input through trunk and neck networks, returning multiscale features and positional encodings."""
  features, pos = self.neck(self.trunk(sample))
  if self.scalp > 0:
  # Discard the lowest resolution features
@@ -552,7 +551,7 @@ class FpnNeck(nn.Module):
  fpn_top_down_levels: Optional[List[int]] = None,
  ):
  """
- Initializes a modified Feature Pyramid Network (FPN) neck.
+ Initialize a modified Feature Pyramid Network (FPN) neck.

  This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
  similar to ViT positional embedding interpolation.
@@ -594,18 +593,18 @@ class FpnNeck(nn.Module):
  assert fuse_type in {"sum", "avg"}
  self.fuse_type = fuse_type

- # levels to have top-down features in its outputs
+ # Levels to have top-down features in its outputs
  # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
  # have top-down propagation, while outputs of level 0 and level 1 have only
- # lateral features from the same backbone level.
+ # lateral features from the same backbone level
  if fpn_top_down_levels is None:
- # default is to have top-down features on all levels
+ # Default is to have top-down features on all levels
  fpn_top_down_levels = range(len(self.convs))
  self.fpn_top_down_levels = list(fpn_top_down_levels)

  def forward(self, xs: List[torch.Tensor]):
  """
- Performs forward pass through the Feature Pyramid Network (FPN) neck.
+ Perform forward pass through the Feature Pyramid Network (FPN) neck.

  This method processes a list of input tensors from the backbone through the FPN, applying lateral connections
  and top-down feature fusion. It generates output feature maps and corresponding positional encodings.
@@ -614,10 +613,10 @@ class FpnNeck(nn.Module):
  xs (List[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).

  Returns:
- (Tuple[List[torch.Tensor], List[torch.Tensor]]): A tuple containing:
- - out (List[torch.Tensor]): List of output feature maps after FPN processing, each with shape
- (B, d_model, H, W).
- - pos (List[torch.Tensor]): List of positional encodings corresponding to each output feature map.
+ out (List[torch.Tensor]): List of output feature maps after FPN processing, each with shape
+ (B, d_model, H, W).
+ pos (List[torch.Tensor]): List of positional encodings corresponding to each output feature map.

  Examples:
  >>> fpn_neck = FpnNeck(d_model=256, backbone_channel_list=[64, 128, 256, 512])
@@ -629,10 +627,10 @@ class FpnNeck(nn.Module):
  out = [None] * len(self.convs)
  pos = [None] * len(self.convs)
  assert len(xs) == len(self.convs)
- # fpn forward pass
+ # FPN forward pass
  # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
  prev_features = None
- # forward in top-down order (from low to high resolution)
+ # Forward in top-down order (from low to high resolution)
  n = len(self.convs) - 1
  for i in range(n, -1, -1):
  x = xs[i]
@@ -763,7 +761,7 @@ class Hiera(nn.Module):
  stride=(4, 4),
  padding=(3, 3),
  )
- # Which blocks have global att?
+ # Which blocks have global attention?
  self.global_att_blocks = global_att_blocks

  # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
@@ -778,8 +776,7 @@ class Hiera(nn.Module):

  for i in range(depth):
  dim_out = embed_dim
- # lags by a block, so first block of
- # next stage uses an initial window size
+ # Lags by a block, so first block of next stage uses an initial window size
  # of previous stage and final window size of current stage
  window_size = self.window_spec[cur_stage - 1]

@@ -841,7 +838,7 @@ class Hiera(nn.Module):
  x = self.patch_embed(x)
  # x: (B, H, W, C)

- # Add pos embed
+ # Add positional embedding
  x = x + self._get_pos_embed(x.shape[1:3])

  outputs = []
ultralytics/models/sam/modules/memory_attention.py

@@ -144,7 +144,19 @@ class MemoryAttentionLayer(nn.Module):
  query_pos: Optional[Tensor] = None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
- """Process input tensors through self-attention, cross-attention, and feedforward network layers."""
+ """
+ Process input tensors through self-attention, cross-attention, and feedforward network layers.
+
+ Args:
+ tgt (Tensor): Target tensor for self-attention with shape (N, L, D).
+ memory (Tensor): Memory tensor for cross-attention with shape (N, S, D).
+ pos (Optional[Tensor]): Positional encoding for memory tensor.
+ query_pos (Optional[Tensor]): Positional encoding for target tensor.
+ num_k_exclude_rope (int): Number of keys to exclude from rotary position embedding.
+
+ Returns:
+ (torch.Tensor): Processed tensor after attention and feedforward layers with shape (N, L, D).
+ """
  tgt = self._forward_sa(tgt, query_pos)
  tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
  # MLP
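
As a quick illustration of the typing pass in the hunks above, here is a minimal doctest-style sketch exercising the retyped DropPath module. The import path is inferred from the blocks.py hunks, and the tensor shape is an arbitrary example, not taken from the package.

>>> import torch
>>> from ultralytics.models.sam.modules.blocks import DropPath  # path inferred from the hunks above
>>> drop_path = DropPath(drop_prob=0.1, scale_by_keep=True)  # new typed signature: float and bool arguments
>>> x = torch.randn(4, 64, 256)  # arbitrary example shape
>>> output = drop_path(x)  # stochastic depth is applied only in training mode; output shape matches the input
>>> print(output.shape)
torch.Size([4, 64, 256])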