ultralytics 8.3.89__py3-none-any.whl → 8.3.91__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- tests/conftest.py +2 -2
- tests/test_cli.py +13 -11
- tests/test_cuda.py +10 -1
- tests/test_exports.py +2 -2
- tests/test_integrations.py +1 -5
- tests/test_python.py +16 -16
- tests/test_solutions.py +9 -9
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +3 -1
- ultralytics/cfg/models/11/yolo11-cls.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-obb.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-pose.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-seg.yaml +5 -5
- ultralytics/cfg/models/11/yolo11.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-obb.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-world.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8.yaml +5 -5
- ultralytics/cfg/models/v9/yolov9c-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9c.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9m.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9t.yaml +1 -1
- ultralytics/data/annotator.py +9 -14
- ultralytics/data/base.py +118 -30
- ultralytics/data/build.py +63 -24
- ultralytics/data/converter.py +5 -5
- ultralytics/data/dataset.py +207 -53
- ultralytics/data/loaders.py +1 -0
- ultralytics/data/split_dota.py +39 -12
- ultralytics/data/utils.py +15 -19
- ultralytics/engine/exporter.py +24 -23
- ultralytics/engine/model.py +67 -88
- ultralytics/engine/predictor.py +106 -21
- ultralytics/engine/trainer.py +32 -23
- ultralytics/engine/tuner.py +21 -18
- ultralytics/engine/validator.py +75 -41
- ultralytics/hub/__init__.py +12 -13
- ultralytics/hub/auth.py +9 -12
- ultralytics/hub/session.py +76 -21
- ultralytics/hub/utils.py +19 -17
- ultralytics/models/fastsam/model.py +20 -11
- ultralytics/models/fastsam/predict.py +36 -16
- ultralytics/models/fastsam/utils.py +5 -5
- ultralytics/models/fastsam/val.py +6 -6
- ultralytics/models/nas/model.py +22 -11
- ultralytics/models/nas/predict.py +9 -4
- ultralytics/models/nas/val.py +5 -5
- ultralytics/models/rtdetr/model.py +20 -11
- ultralytics/models/rtdetr/predict.py +18 -15
- ultralytics/models/rtdetr/train.py +20 -16
- ultralytics/models/rtdetr/val.py +42 -6
- ultralytics/models/sam/__init__.py +1 -1
- ultralytics/models/sam/amg.py +50 -4
- ultralytics/models/sam/model.py +8 -14
- ultralytics/models/sam/modules/decoders.py +18 -21
- ultralytics/models/sam/modules/encoders.py +25 -46
- ultralytics/models/sam/modules/memory_attention.py +19 -15
- ultralytics/models/sam/modules/sam.py +18 -25
- ultralytics/models/sam/modules/tiny_encoder.py +19 -29
- ultralytics/models/sam/modules/transformer.py +35 -57
- ultralytics/models/sam/modules/utils.py +15 -15
- ultralytics/models/sam/predict.py +0 -3
- ultralytics/models/utils/loss.py +87 -36
- ultralytics/models/utils/ops.py +26 -31
- ultralytics/models/yolo/classify/predict.py +24 -3
- ultralytics/models/yolo/classify/train.py +77 -10
- ultralytics/models/yolo/classify/val.py +40 -15
- ultralytics/models/yolo/detect/predict.py +23 -10
- ultralytics/models/yolo/detect/train.py +85 -15
- ultralytics/models/yolo/detect/val.py +145 -21
- ultralytics/models/yolo/model.py +1 -2
- ultralytics/models/yolo/obb/predict.py +12 -4
- ultralytics/models/yolo/obb/train.py +7 -0
- ultralytics/models/yolo/obb/val.py +25 -7
- ultralytics/models/yolo/pose/predict.py +22 -6
- ultralytics/models/yolo/pose/train.py +17 -1
- ultralytics/models/yolo/pose/val.py +46 -21
- ultralytics/models/yolo/segment/predict.py +22 -8
- ultralytics/models/yolo/segment/train.py +6 -0
- ultralytics/models/yolo/segment/val.py +100 -14
- ultralytics/models/yolo/world/train.py +38 -8
- ultralytics/models/yolo/world/train_world.py +39 -10
- ultralytics/nn/autobackend.py +28 -14
- ultralytics/nn/modules/__init__.py +3 -0
- ultralytics/nn/modules/activation.py +12 -3
- ultralytics/nn/modules/block.py +587 -84
- ultralytics/nn/modules/conv.py +418 -54
- ultralytics/nn/modules/head.py +3 -4
- ultralytics/nn/modules/transformer.py +320 -34
- ultralytics/nn/modules/utils.py +17 -3
- ultralytics/nn/tasks.py +221 -69
- ultralytics/solutions/ai_gym.py +2 -2
- ultralytics/solutions/analytics.py +4 -4
- ultralytics/solutions/heatmap.py +4 -4
- ultralytics/solutions/instance_segmentation.py +10 -4
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +2 -2
- ultralytics/solutions/parking_management.py +9 -9
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/region_counter.py +2 -2
- ultralytics/solutions/security_alarm.py +7 -7
- ultralytics/solutions/solutions.py +7 -4
- ultralytics/solutions/speed_estimation.py +2 -2
- ultralytics/solutions/streamlit_inference.py +6 -6
- ultralytics/solutions/trackzone.py +9 -2
- ultralytics/solutions/vision_eye.py +4 -4
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/bot_sort.py +23 -22
- ultralytics/trackers/byte_tracker.py +4 -4
- ultralytics/trackers/track.py +2 -1
- ultralytics/trackers/utils/gmc.py +26 -27
- ultralytics/trackers/utils/kalman_filter.py +31 -29
- ultralytics/trackers/utils/matching.py +7 -7
- ultralytics/utils/__init__.py +32 -27
- ultralytics/utils/autobatch.py +5 -5
- ultralytics/utils/benchmarks.py +111 -18
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +11 -11
- ultralytics/utils/callbacks/comet.py +42 -24
- ultralytics/utils/callbacks/dvc.py +11 -10
- ultralytics/utils/callbacks/hub.py +8 -8
- ultralytics/utils/callbacks/mlflow.py +1 -1
- ultralytics/utils/callbacks/neptune.py +12 -10
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +6 -6
- ultralytics/utils/callbacks/wb.py +16 -16
- ultralytics/utils/checks.py +116 -35
- ultralytics/utils/dist.py +15 -2
- ultralytics/utils/downloads.py +13 -9
- ultralytics/utils/files.py +12 -13
- ultralytics/utils/instance.py +112 -45
- ultralytics/utils/loss.py +28 -33
- ultralytics/utils/metrics.py +246 -181
- ultralytics/utils/ops.py +61 -53
- ultralytics/utils/patches.py +8 -6
- ultralytics/utils/plotting.py +65 -45
- ultralytics/utils/tal.py +88 -57
- ultralytics/utils/torch_utils.py +181 -33
- ultralytics/utils/triton.py +13 -3
- ultralytics/utils/tuner.py +8 -16
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/METADATA +1 -1
- ultralytics-8.3.91.dist-info/RECORD +250 -0
- ultralytics-8.3.89.dist-info/RECORD +0 -250
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/LICENSE +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.89.dist-info → ultralytics-8.3.91.dist-info}/top_level.txt +0 -0
```diff
@@ -176,7 +176,7 @@ class SAM2Model(torch.nn.Module):
         compile_image_encoder: bool = False,
     ):
         """
-
+        Initialize the SAM2Model for video object segmentation with memory-based tracking.

         Args:
             image_encoder (nn.Module): Visual encoder for extracting image features.
@@ -213,9 +213,9 @@ class SAM2Model(torch.nn.Module):
                 the encoder.
             proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                 encoding in object pointers.
-            use_signed_tpos_enc_to_obj_ptrs (bool):
-                in the temporal positional encoding in the object pointers, only relevant when both
-                and `add_tpos_enc_to_obj_ptrs=True`.
+            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance (instead of unsigned absolute distance)
+                in the temporal positional encoding in the object pointers, only relevant when both
+                `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`.
             only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
                 during evaluation.
             pred_obj_scores (bool): Whether to predict if there is an object in the frame.
```
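For readers skimming the `use_signed_tpos_enc_to_obj_ptrs` change, here is a minimal sketch of the signed-versus-unsigned temporal distance the flag toggles. This is an illustration only, not the library code; in the model the resulting offset is fed into a sinusoidal positional encoding.

```python
import torch

def obj_ptr_temporal_offset(frame_idx: int, ptr_frame_idx: int, signed: bool) -> torch.Tensor:
    """Toy illustration: temporal offset between the current frame and an object pointer's frame."""
    diff = frame_idx - ptr_frame_idx  # signed: negative when the pointer frame lies in the future
    return torch.tensor(float(diff if signed else abs(diff)))

print(obj_ptr_temporal_offset(10, 7, signed=True))   # tensor(3.)
print(obj_ptr_temporal_offset(7, 10, signed=True))   # tensor(-3.)
print(obj_ptr_temporal_offset(7, 10, signed=False))  # tensor(3.)
```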
```diff
@@ -332,18 +332,18 @@ class SAM2Model(torch.nn.Module):

     @property
     def device(self):
-        """
+        """Return the device on which the model's parameters are stored."""
         return next(self.parameters()).device

     def forward(self, *args, **kwargs):
-        """
+        """Process image and prompt inputs to generate object masks and scores in video sequences."""
         raise NotImplementedError(
             "Please use the corresponding methods in SAM2VideoPredictor for inference."
             "See notebooks/video_predictor_example.ipynb for an example."
         )

     def _build_sam_heads(self):
-        """
+        """Build SAM-style prompt encoder and mask decoder for image segmentation tasks."""
         self.sam_prompt_embed_dim = self.hidden_dim
         self.sam_image_embedding_size = self.image_size // self.backbone_stride

@@ -545,7 +545,7 @@ class SAM2Model(torch.nn.Module):
         )

     def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
-        """
+        """Process mask inputs directly as output, bypassing SAM encoder/decoder."""
         # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
         out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
         mask_inputs_float = mask_inputs.float()
```
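As a quick sanity check of the -10/+10 logit trick visible in the context lines above, the following standalone snippet (illustration only) shows how a binary mask maps to near-0/1 probabilities after the sigmoid:

```python
import torch

mask = torch.tensor([[0.0, 1.0], [1.0, 0.0]])  # binary mask input
out_scale, out_bias = 20.0, -10.0
logits = mask * out_scale + out_bias           # 0 -> -10, 1 -> +10
print(torch.sigmoid(logits))
# tensor([[4.5398e-05, 9.9995e-01],
#         [9.9995e-01, 4.5398e-05]])
```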
```diff
@@ -592,7 +592,7 @@ class SAM2Model(torch.nn.Module):
         )

     def forward_image(self, img_batch: torch.Tensor):
-        """
+        """Process image batch through encoder to extract multi-level features for SAM model."""
         backbone_out = self.image_encoder(img_batch)
         if self.use_high_res_features_in_sam:
             # precompute projected level 0 and level 1 features in SAM decoder
@@ -602,7 +602,7 @@ class SAM2Model(torch.nn.Module):
         return backbone_out

     def _prepare_backbone_features(self, backbone_out):
-        """
+        """Prepare and flatten visual features from the image backbone output for further processing."""
         assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
         assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

@@ -627,7 +627,7 @@ class SAM2Model(torch.nn.Module):
         num_frames,
         track_in_reverse=False,  # tracking in reverse time order (for demo usage)
     ):
-        """
+        """Prepare memory-conditioned features by fusing current frame's visual features with previous memories."""
         B = current_vision_feats[-1].size(1)  # batch size on this frame
         C = self.hidden_dim
         H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
@@ -788,7 +788,7 @@ class SAM2Model(torch.nn.Module):
         object_score_logits,
         is_mask_from_pts,
     ):
-        """
+        """Encode frame features and masks into a new memory representation for video segmentation."""
         B = current_vision_feats[-1].size(1)  # batch size on this frame
         C = self.hidden_dim
         H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
@@ -838,7 +838,7 @@ class SAM2Model(torch.nn.Module):
         track_in_reverse,
         prev_sam_mask_logits,
     ):
-        """
+        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
         current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
         # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
         if len(current_vision_feats) > 1:
@@ -893,9 +893,7 @@ class SAM2Model(torch.nn.Module):
         object_score_logits,
         current_out,
     ):
-        """
-        used in future frames).
-        """
+        """Run memory encoder on predicted mask to encode it into a new memory feature for future frames."""
         if run_mem_encoder and self.num_maskmem > 0:
             high_res_masks_for_mem_enc = high_res_masks
             maskmem_features, maskmem_pos_enc = self._encode_new_memory(
@@ -932,7 +930,7 @@ class SAM2Model(torch.nn.Module):
         # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
         prev_sam_mask_logits=None,
     ):
-        """
+        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
         current_out, sam_outputs, _, _ = self._track_step(
             frame_idx,
             is_init_cond_frame,
@@ -970,7 +968,7 @@ class SAM2Model(torch.nn.Module):
         return current_out

     def _use_multimask(self, is_init_cond_frame, point_inputs):
-        """
+        """Determine whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
         num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
         return (
             self.multimask_output_in_sam
@@ -980,7 +978,7 @@ class SAM2Model(torch.nn.Module):

     @staticmethod
     def _apply_non_overlapping_constraints(pred_masks):
-        """
+        """Apply non-overlapping constraints to masks, keeping the highest scoring object per location."""
         batch_size = pred_masks.size(0)
         if batch_size == 1:
             return pred_masks
```
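A minimal sketch of the idea behind `_apply_non_overlapping_constraints`, assuming `pred_masks` holds per-object mask logits of shape (num_objects, 1, H, W). The suppression value and exact tensor ops here are illustrative, not the library implementation:

```python
import torch

def non_overlapping(pred_masks: torch.Tensor, suppress_to: float = -10.0) -> torch.Tensor:
    """Keep, at each pixel, only the object with the highest mask logit; clamp the others down."""
    winner = torch.argmax(pred_masks, dim=0, keepdim=True)                # (1, 1, H, W) winning object index
    obj_ids = torch.arange(pred_masks.size(0), device=pred_masks.device)  # (num_objects,)
    keep = winner == obj_ids[:, None, None, None]                         # (num_objects, 1, H, W) boolean
    return torch.where(keep, pred_masks, pred_masks.clamp(max=suppress_to))

masks = torch.randn(3, 1, 4, 4)  # 3 tracked objects
print(non_overlapping(masks).shape)  # torch.Size([3, 1, 4, 4])
```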
```diff
@@ -1001,12 +999,7 @@ class SAM2Model(torch.nn.Module):
         self.binarize_mask_from_pts_for_mem_enc = binarize

     def set_imgsz(self, imgsz):
-        """
-        Set image size to make model compatible with different image sizes.
-
-        Args:
-            imgsz (Tuple[int, int]): The size of the input image.
-        """
+        """Set image size to make model compatible with different image sizes."""
         self.image_size = imgsz[0]
         self.sam_prompt_encoder.input_image_size = imgsz
         self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
```
```diff
@@ -27,7 +27,7 @@ class Conv2d_BN(torch.nn.Sequential):

     Attributes:
         c (torch.nn.Conv2d): 2D convolution layer.
-
+        bn (torch.nn.BatchNorm2d): Batch normalization layer.

     Methods:
         __init__: Initializes the Conv2d_BN with specified parameters.
@@ -265,9 +265,9 @@ class ConvLayer(nn.Module):
         dim (int): The dimensionality of the input and output.
         input_resolution (Tuple[int, int]): The resolution of the input image.
         depth (int): The number of MBConv layers in the block.
-        activation (
+        activation (nn.Module): Activation function applied after each convolution.
         drop_path (float | List[float]): Drop path rate. Single float or a list of floats for each MBConv.
-        downsample (Optional[
+        downsample (Optional[nn.Module]): Function for downsampling the output. None to skip downsampling.
         use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
         out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
         conv_expand_ratio (float): Expansion ratio for the MBConv layers.
@@ -413,12 +413,9 @@ class Attention(torch.nn.Module):
     Args:
         dim (int): The dimensionality of the input and output.
         key_dim (int): The dimensionality of the keys and queries.
-        num_heads (int): Number of attention heads.
-        attn_ratio (float): Attention ratio, affecting the dimensions of the value vectors.
-        resolution (Tuple[int, int]): Spatial resolution of the input feature map.
-
-    Raises:
-        AssertionError: If 'resolution' is not a tuple of length 2.
+        num_heads (int): Number of attention heads.
+        attn_ratio (float): Attention ratio, affecting the dimensions of the value vectors.
+        resolution (Tuple[int, int]): Spatial resolution of the input feature map.

     Examples:
         >>> attn = Attention(dim=256, key_dim=64, num_heads=8, resolution=(14, 14))
@@ -821,22 +818,20 @@ class TinyViT(nn.Module):
     attention and convolution blocks, and a classification head.

     Args:
-        img_size (int): Size of the input image.
-        in_chans (int): Number of input channels.
-        num_classes (int): Number of classes for classification.
+        img_size (int): Size of the input image.
+        in_chans (int): Number of input channels.
+        num_classes (int): Number of classes for classification.
         embed_dims (Tuple[int, int, int, int]): Embedding dimensions for each stage.
-
-        depths (Tuple[int, int, int, int]): Number of blocks in each stage. Default is (2, 2, 6, 2).
+        depths (Tuple[int, int, int, int]): Number of blocks in each stage.
         num_heads (Tuple[int, int, int, int]): Number of attention heads in each stage.
-
-
-
-
-
-
-
-
-        layer_lr_decay (float): Layer-wise learning rate decay factor. Default is 1.0.
+        window_sizes (Tuple[int, int, int, int]): Window sizes for each stage.
+        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
+        drop_rate (float): Dropout rate.
+        drop_path_rate (float): Stochastic depth rate.
+        use_checkpoint (bool): Whether to use checkpointing to save memory.
+        mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
+        local_conv_size (int): Kernel size for local convolutions.
+        layer_lr_decay (float): Layer-wise learning rate decay factor.

     Examples:
         >>> model = TinyViT(img_size=224, num_classes=1000)
@@ -992,12 +987,7 @@ class TinyViT(nn.Module):
         return self.forward_features(x)

     def set_imgsz(self, imgsz=[1024, 1024]):
-        """
-        Set image size to make model compatible with different image sizes.
-
-        Args:
-            imgsz (Tuple[int, int]): The size of the input image.
-        """
+        """Set image size to make model compatible with different image sizes."""
         imgsz = [s // 4 for s in imgsz]
         self.patches_resolution = imgsz
         for i, layer in enumerate(self.layers):
```
```diff
@@ -57,23 +57,6 @@ class TwoWayTransformer(nn.Module):
             mlp_dim (int): Internal channel dimension for the MLP block.
             activation (Type[nn.Module]): Activation function to use in the MLP block.
             attention_downsample_rate (int): Downsampling rate for attention mechanism.
-
-        Attributes:
-            depth (int): Number of layers in the transformer.
-            embedding_dim (int): Channel dimension for input embeddings.
-            num_heads (int): Number of heads for multihead attention.
-            mlp_dim (int): Internal channel dimension for the MLP block.
-            layers (nn.ModuleList): List of TwoWayAttentionBlock layers.
-            final_attn_token_to_image (Attention): Final attention layer from queries to image.
-            norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.
-
-        Examples:
-            >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
-            >>> image_embedding = torch.randn(1, 256, 32, 32)
-            >>> image_pe = torch.randn(1, 256, 32, 32)
-            >>> point_embedding = torch.randn(1, 100, 256)
-            >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
-            >>> print(output_queries.shape, output_image.shape)
         """
         super().__init__()
         self.depth = depth
@@ -104,23 +87,16 @@ class TwoWayTransformer(nn.Module):
         point_embedding: Tensor,
     ) -> Tuple[Tensor, Tensor]:
         """
-
+        Process image and point embeddings through the Two-Way Transformer.

         Args:
-            image_embedding (
-            image_pe (
-            point_embedding (
+            image_embedding (Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
+            image_pe (Tensor): Positional encoding to add to the image, with same shape as image_embedding.
+            point_embedding (Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

         Returns:
-            (
-
-        Examples:
-            >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
-            >>> image_embedding = torch.randn(1, 256, 32, 32)
-            >>> image_pe = torch.randn(1, 256, 32, 32)
-            >>> point_embedding = torch.randn(1, 100, 256)
-            >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
-            >>> print(output_queries.shape, output_image.shape)
+            queries (Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
+            keys (Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
         """
         # BxCxHxW -> BxHWxC == B x N_image_tokens x C
         image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
```
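The doctest removed from the class docstring above is still the quickest way to see the expected shapes in practice. Re-created here as a hedged sketch (the import path is assumed from the ultralytics SAM module layout; the printed shapes follow the Args/Returns documented in the new docstring):

```python
import torch
from ultralytics.models.sam.modules.transformer import TwoWayTransformer

transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
image_embedding = torch.randn(1, 256, 32, 32)  # (B, embedding_dim, H, W)
image_pe = torch.randn(1, 256, 32, 32)         # same shape as image_embedding
point_embedding = torch.randn(1, 100, 256)     # (B, N_points, embedding_dim)
queries, keys = transformer(image_embedding, image_pe, point_embedding)
print(queries.shape, keys.shape)
# torch.Size([1, 100, 256]) torch.Size([1, 1024, 256])
```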
```diff
@@ -191,7 +167,7 @@ class TwoWayAttentionBlock(nn.Module):
         skip_first_layer_pe: bool = False,
     ) -> None:
         """
-
+        Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

         This block implements a specialized transformer layer with four main components: self-attention on sparse
         inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
@@ -204,15 +180,6 @@ class TwoWayAttentionBlock(nn.Module):
             activation (Type[nn.Module]): Activation function for the MLP block.
             attention_downsample_rate (int): Downsampling rate for the attention mechanism.
             skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.
-
-        Examples:
-            >>> embedding_dim, num_heads = 256, 8
-            >>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
-            >>> queries = torch.randn(1, 100, embedding_dim)
-            >>> keys = torch.randn(1, 1000, embedding_dim)
-            >>> query_pe = torch.randn(1, 100, embedding_dim)
-            >>> key_pe = torch.randn(1, 1000, embedding_dim)
-            >>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
         """
         super().__init__()
         self.self_attn = Attention(embedding_dim, num_heads)
@@ -230,7 +197,19 @@ class TwoWayAttentionBlock(nn.Module):
         self.skip_first_layer_pe = skip_first_layer_pe

     def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
-        """
+        """
+        Apply two-way attention to process query and key embeddings in a transformer block.
+
+        Args:
+            queries (Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
+            keys (Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
+            query_pe (Tensor): Positional encodings for queries with same shape as queries.
+            key_pe (Tensor): Positional encodings for keys with same shape as keys.
+
+        Returns:
+            queries (Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
+            keys (Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
+        """
         # Self attention block
         if self.skip_first_layer_pe:
             queries = self.self_attn(q=queries, k=queries, v=queries)
@@ -301,27 +280,16 @@ class Attention(nn.Module):
         kv_in_dim: int = None,
     ) -> None:
         """
-
-
-        This class implements a multi-head attention mechanism with optional downsampling of the internal
-        dimension for queries, keys, and values.
+        Initialize the Attention module with specified dimensions and settings.

         Args:
             embedding_dim (int): Dimensionality of input embeddings.
             num_heads (int): Number of attention heads.
-            downsample_rate (int): Factor by which internal dimensions are downsampled.
+            downsample_rate (int): Factor by which internal dimensions are downsampled.
             kv_in_dim (int | None): Dimensionality of key and value inputs. If None, uses embedding_dim.

         Raises:
             AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
-
-        Examples:
-            >>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
-            >>> q = torch.randn(1, 100, 256)
-            >>> k = v = torch.randn(1, 50, 256)
-            >>> output = attn(q, k, v)
-            >>> print(output.shape)
-            torch.Size([1, 100, 256])
         """
         super().__init__()
         self.embedding_dim = embedding_dim
@@ -337,20 +305,30 @@ class Attention(nn.Module):

     @staticmethod
     def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
-        """
+        """Separate the input tensor into the specified number of attention heads."""
         b, n, c = x.shape
         x = x.reshape(b, n, num_heads, c // num_heads)
         return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

     @staticmethod
     def _recombine_heads(x: Tensor) -> Tensor:
-        """
+        """Recombine separated attention heads into a single tensor."""
         b, n_heads, n_tokens, c_per_head = x.shape
         x = x.transpose(1, 2)
         return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

     def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
-        """
+        """
+        Apply multi-head attention to query, key, and value tensors with optional downsampling.
+
+        Args:
+            q (Tensor): Query tensor with shape (B, N_q, embedding_dim).
+            k (Tensor): Key tensor with shape (B, N_k, embedding_dim).
+            v (Tensor): Value tensor with shape (B, N_k, embedding_dim).
+
+        Returns:
+            (Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
+        """
         # Input projections
         q = self.q_proj(q)
         k = self.k_proj(k)
```
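The `_separate_heads`/`_recombine_heads` pair shown in the context lines is a standard reshape-transpose round trip; a self-contained illustration of just that pattern:

```python
import torch

b, n, c, num_heads = 1, 100, 256, 8
x = torch.randn(b, n, c)

heads = x.reshape(b, n, num_heads, c // num_heads).transpose(1, 2)          # B x N_heads x N_tokens x C_per_head
merged = heads.transpose(1, 2).reshape(b, n, num_heads * (c // num_heads))  # back to B x N_tokens x C

print(heads.shape, merged.shape, torch.equal(x, merged))
# torch.Size([1, 8, 100, 32]) torch.Size([1, 100, 256]) True
```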
```diff
@@ -8,7 +8,7 @@ import torch.nn.functional as F

 def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
     """
-
+    Select the closest conditioning frames to a given frame index.

     Args:
         frame_idx (int): Current frame index.
@@ -37,17 +37,17 @@ def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num
         assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
         selected_outputs = {}

-        #
+        # The closest conditioning frame before `frame_idx` (if any)
         idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None)
         if idx_before is not None:
             selected_outputs[idx_before] = cond_frame_outputs[idx_before]

-        #
+        # The closest conditioning frame after `frame_idx` (if any)
         idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None)
         if idx_after is not None:
             selected_outputs[idx_after] = cond_frame_outputs[idx_after]

-        #
+        # Add other temporally closest conditioning frames until reaching a total
         # of `max_cond_frame_num` conditioning frames.
         num_remain = max_cond_frame_num - len(selected_outputs)
         inds_remain = sorted(
```
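To make the before/after/nearest-remaining selection above concrete, here is a standalone re-implementation of just the index-selection logic (the real function also carries along and returns the per-frame outputs themselves); the frame numbers are made up:

```python
def pick_cond_frame_indices(frame_idx, cond_frame_indices, max_cond_frame_num):
    """Choose up to max_cond_frame_num conditioning frames closest in time to frame_idx."""
    assert max_cond_frame_num >= 2
    chosen = set()
    idx_before = max((t for t in cond_frame_indices if t < frame_idx), default=None)  # closest earlier frame
    idx_after = min((t for t in cond_frame_indices if t >= frame_idx), default=None)  # closest frame at/after
    chosen.update(t for t in (idx_before, idx_after) if t is not None)
    remaining = sorted((t for t in cond_frame_indices if t not in chosen), key=lambda t: abs(t - frame_idx))
    chosen.update(remaining[: max_cond_frame_num - len(chosen)])
    return sorted(chosen)

print(pick_cond_frame_indices(18, [0, 5, 20, 40], max_cond_frame_num=3))  # [0, 5, 20]
```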
```diff
@@ -61,7 +61,7 @@ def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num


 def get_1d_sine_pe(pos_inds, dim, temperature=10000):
-    """
+    """Generate 1D sinusoidal positional embeddings for given positions and dimensions."""
     pe_dim = dim // 2
     dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
     dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
```
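A hedged usage sketch of `get_1d_sine_pe`: the `pe_dim = dim // 2` context above suggests the sine and cosine halves are concatenated to `dim` channels, and the import path is assumed from the module layout.

```python
import torch
from ultralytics.models.sam.modules.utils import get_1d_sine_pe

pos = torch.arange(4)              # positions 0..3
pe = get_1d_sine_pe(pos, dim=256)  # sinusoidal features per position
print(pe.shape)                    # expected: torch.Size([4, 256])
```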
```diff
@@ -72,7 +72,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000):


 def init_t_xy(end_x: int, end_y: int):
-    """
+    """Initialize 1D and 2D coordinate tensors for a grid of specified dimensions."""
     t = torch.arange(end_x * end_y, dtype=torch.float32)
     t_x = (t % end_x).float()
     t_y = torch.div(t, end_x, rounding_mode="floor").float()
@@ -80,7 +80,7 @@ def init_t_xy(end_x: int, end_y: int):


 def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
-    """
+    """Compute axial complex exponential positional encodings for 2D spatial positions in a grid."""
     freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
     freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))

```
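A hedged usage sketch of `compute_axial_cis`: the signature is taken from the context lines, and the expected output assumes the function concatenates the x- and y-axis components into one unit-magnitude complex frequency row per grid position.

```python
import torch
from ultralytics.models.sam.modules.utils import compute_axial_cis

freqs_cis = compute_axial_cis(dim=64, end_x=8, end_y=8)  # dim is the per-head channel count
print(freqs_cis.dtype, freqs_cis.shape)                  # expected: torch.complex64 torch.Size([64, 32])
```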
```diff
@@ -93,7 +93,7 @@ def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):


 def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
-    """
+    """Reshape frequency tensor for broadcasting with input tensor, ensuring dimensional compatibility."""
     ndim = x.ndim
     assert 0 <= 1 < ndim
     assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
@@ -107,15 +107,15 @@ def apply_rotary_enc(
     freqs_cis: torch.Tensor,
     repeat_freqs_k: bool = False,
 ):
-    """
+    """Apply rotary positional encoding to query and key tensors using complex-valued frequency components."""
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) if xk.shape[-2] != 0 else None
     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
     if xk_ is None:
-        #
+        # No keys to rotate, due to dropout
         return xq_out.type_as(xq).to(xq.device), xk
-    #
+    # Repeat freqs along seq_len dim to match k seq_len
     if repeat_freqs_k:
         r = xk_.shape[-2] // xq_.shape[-2]
         freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
```
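The complex-number rotary trick used above can be demonstrated in isolation: pack channel pairs as complex numbers, multiply by unit-magnitude phases, and unpack. This is an illustration of the technique, not the library function:

```python
import torch

B, n_heads, seq_len, head_dim = 1, 2, 4, 8
x = torch.randn(B, n_heads, seq_len, head_dim)

# One phase angle per (position, channel pair); unit-magnitude complex numbers encode the rotation.
angles = torch.outer(torch.arange(seq_len, dtype=torch.float32), torch.ones(head_dim // 2))
freqs_cis = torch.polar(torch.ones_like(angles), angles)                       # complex64, |z| = 1

x_c = torch.view_as_complex(x.reshape(B, n_heads, seq_len, head_dim // 2, 2))  # pair channels as complex
x_rot = torch.view_as_real(x_c * freqs_cis).flatten(3)                         # rotate, unpack to real

print(x_rot.shape, torch.allclose(x.norm(dim=-1), x_rot.norm(dim=-1), atol=1e-5))
# torch.Size([1, 2, 4, 8]) True  (a pure rotation preserves per-token norms)
```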
```diff
@@ -125,7 +125,7 @@

 def window_partition(x, window_size):
     """
-
+    Partition input tensor into non-overlapping windows with padding if needed.

     Args:
         x (torch.Tensor): Input tensor with shape (B, H, W, C).
@@ -157,7 +157,7 @@

 def window_unpartition(windows, window_size, pad_hw, hw):
     """
-
+    Unpartition windowed sequences into original sequences and remove padding.

     This function reverses the windowing process, reconstructing the original input from windowed segments
     and removing any padding that was added during the windowing process.
```
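A hedged round-trip example for the two functions above, assuming the usual ViTDet-style return values (the windows plus the padded height/width):

```python
import torch
from ultralytics.models.sam.modules.utils import window_partition, window_unpartition

x = torch.randn(1, 14, 14, 96)                        # (B, H, W, C); 14 is not a multiple of the window size
windows, pad_hw = window_partition(x, window_size=8)  # pads to 16x16, then tiles into 8x8 windows
print(windows.shape, pad_hw)                          # expected: torch.Size([4, 8, 8, 96]) (16, 16)

restored = window_unpartition(windows, 8, pad_hw, (14, 14))
print(torch.equal(restored, x))                       # True: the padding is stripped on the way back
```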
```diff
@@ -195,7 +195,7 @@

 def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
     """
-
+    Extract relative positional embeddings based on query and key sizes.

     Args:
         q_size (int): Size of the query.
@@ -244,7 +244,7 @@ def add_decomposed_rel_pos(
     k_size: Tuple[int, int],
 ) -> torch.Tensor:
     """
-
+    Add decomposed Relative Positional Embeddings to the attention map.

     This function calculates and applies decomposed Relative Positional Embeddings as described in the MVITv2
     paper. It enhances the attention mechanism by incorporating spatial relationships between query and key
@@ -701,9 +701,6 @@ class SAM2Predictor(Predictor):
             - The method supports batched inference for multiple objects when points or bboxes are provided.
             - Input prompts (bboxes, points) are automatically scaled to match the input image dimensions.
            - When both bboxes and points are provided, they are merged into a single 'points' input for the model.
-
-        References:
-            - SAM2 Paper: [Add link to SAM2 paper when available]
         """
         features = self.get_im_features(im) if self.features is None else self.features

```