ultralytics-8.3.88-py3-none-any.whl → ultralytics-8.3.90-py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
- tests/conftest.py +2 -2
- tests/test_cli.py +13 -11
- tests/test_cuda.py +10 -1
- tests/test_integrations.py +1 -5
- tests/test_python.py +16 -16
- tests/test_solutions.py +9 -9
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +3 -1
- ultralytics/cfg/models/11/yolo11-cls.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-obb.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-pose.yaml +5 -5
- ultralytics/cfg/models/11/yolo11-seg.yaml +5 -5
- ultralytics/cfg/models/11/yolo11.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-obb.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-p6.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-world.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +5 -5
- ultralytics/cfg/models/v8/yolov8.yaml +5 -5
- ultralytics/cfg/models/v9/yolov9c-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9c.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e-seg.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9e.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9m.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/cfg/models/v9/yolov9t.yaml +1 -1
- ultralytics/data/annotator.py +9 -14
- ultralytics/data/base.py +125 -39
- ultralytics/data/build.py +63 -24
- ultralytics/data/converter.py +34 -33
- ultralytics/data/dataset.py +207 -53
- ultralytics/data/loaders.py +1 -0
- ultralytics/data/split_dota.py +39 -12
- ultralytics/data/utils.py +33 -47
- ultralytics/engine/exporter.py +19 -17
- ultralytics/engine/model.py +69 -90
- ultralytics/engine/predictor.py +106 -21
- ultralytics/engine/trainer.py +32 -23
- ultralytics/engine/tuner.py +31 -38
- ultralytics/engine/validator.py +75 -41
- ultralytics/hub/__init__.py +21 -26
- ultralytics/hub/auth.py +9 -12
- ultralytics/hub/session.py +76 -21
- ultralytics/hub/utils.py +19 -17
- ultralytics/models/fastsam/model.py +23 -17
- ultralytics/models/fastsam/predict.py +36 -16
- ultralytics/models/fastsam/utils.py +5 -5
- ultralytics/models/fastsam/val.py +6 -6
- ultralytics/models/nas/model.py +29 -24
- ultralytics/models/nas/predict.py +14 -11
- ultralytics/models/nas/val.py +11 -13
- ultralytics/models/rtdetr/model.py +20 -11
- ultralytics/models/rtdetr/predict.py +21 -21
- ultralytics/models/rtdetr/train.py +25 -24
- ultralytics/models/rtdetr/val.py +47 -14
- ultralytics/models/sam/__init__.py +1 -1
- ultralytics/models/sam/amg.py +50 -4
- ultralytics/models/sam/model.py +8 -14
- ultralytics/models/sam/modules/decoders.py +18 -21
- ultralytics/models/sam/modules/encoders.py +25 -46
- ultralytics/models/sam/modules/memory_attention.py +19 -15
- ultralytics/models/sam/modules/sam.py +18 -25
- ultralytics/models/sam/modules/tiny_encoder.py +19 -29
- ultralytics/models/sam/modules/transformer.py +35 -57
- ultralytics/models/sam/modules/utils.py +15 -15
- ultralytics/models/sam/predict.py +0 -3
- ultralytics/models/utils/loss.py +87 -36
- ultralytics/models/utils/ops.py +26 -31
- ultralytics/models/yolo/classify/predict.py +30 -12
- ultralytics/models/yolo/classify/train.py +83 -19
- ultralytics/models/yolo/classify/val.py +45 -23
- ultralytics/models/yolo/detect/predict.py +29 -19
- ultralytics/models/yolo/detect/train.py +90 -23
- ultralytics/models/yolo/detect/val.py +150 -29
- ultralytics/models/yolo/model.py +1 -2
- ultralytics/models/yolo/obb/predict.py +18 -13
- ultralytics/models/yolo/obb/train.py +12 -8
- ultralytics/models/yolo/obb/val.py +35 -22
- ultralytics/models/yolo/pose/predict.py +28 -15
- ultralytics/models/yolo/pose/train.py +21 -8
- ultralytics/models/yolo/pose/val.py +51 -31
- ultralytics/models/yolo/segment/predict.py +27 -16
- ultralytics/models/yolo/segment/train.py +11 -8
- ultralytics/models/yolo/segment/val.py +110 -29
- ultralytics/models/yolo/world/train.py +43 -16
- ultralytics/models/yolo/world/train_world.py +61 -36
- ultralytics/nn/autobackend.py +28 -14
- ultralytics/nn/modules/__init__.py +12 -12
- ultralytics/nn/modules/activation.py +12 -3
- ultralytics/nn/modules/block.py +587 -84
- ultralytics/nn/modules/conv.py +418 -54
- ultralytics/nn/modules/head.py +3 -4
- ultralytics/nn/modules/transformer.py +320 -34
- ultralytics/nn/modules/utils.py +17 -3
- ultralytics/nn/tasks.py +226 -79
- ultralytics/solutions/ai_gym.py +2 -2
- ultralytics/solutions/analytics.py +4 -4
- ultralytics/solutions/heatmap.py +4 -4
- ultralytics/solutions/instance_segmentation.py +10 -4
- ultralytics/solutions/object_blurrer.py +2 -2
- ultralytics/solutions/object_counter.py +2 -2
- ultralytics/solutions/object_cropper.py +2 -2
- ultralytics/solutions/parking_management.py +9 -9
- ultralytics/solutions/queue_management.py +1 -1
- ultralytics/solutions/region_counter.py +2 -2
- ultralytics/solutions/security_alarm.py +7 -7
- ultralytics/solutions/solutions.py +7 -4
- ultralytics/solutions/speed_estimation.py +2 -2
- ultralytics/solutions/streamlit_inference.py +6 -6
- ultralytics/solutions/trackzone.py +9 -2
- ultralytics/solutions/vision_eye.py +4 -4
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/bot_sort.py +23 -22
- ultralytics/trackers/byte_tracker.py +4 -4
- ultralytics/trackers/track.py +2 -1
- ultralytics/trackers/utils/gmc.py +26 -27
- ultralytics/trackers/utils/kalman_filter.py +31 -29
- ultralytics/trackers/utils/matching.py +7 -7
- ultralytics/utils/__init__.py +37 -35
- ultralytics/utils/autobatch.py +5 -5
- ultralytics/utils/benchmarks.py +111 -18
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +11 -11
- ultralytics/utils/callbacks/comet.py +35 -22
- ultralytics/utils/callbacks/dvc.py +11 -10
- ultralytics/utils/callbacks/hub.py +8 -8
- ultralytics/utils/callbacks/mlflow.py +1 -1
- ultralytics/utils/callbacks/neptune.py +12 -10
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +6 -6
- ultralytics/utils/callbacks/wb.py +16 -16
- ultralytics/utils/checks.py +139 -68
- ultralytics/utils/dist.py +15 -2
- ultralytics/utils/downloads.py +37 -56
- ultralytics/utils/files.py +12 -13
- ultralytics/utils/instance.py +117 -52
- ultralytics/utils/loss.py +28 -33
- ultralytics/utils/metrics.py +246 -181
- ultralytics/utils/ops.py +65 -61
- ultralytics/utils/patches.py +8 -6
- ultralytics/utils/plotting.py +72 -59
- ultralytics/utils/tal.py +88 -57
- ultralytics/utils/torch_utils.py +202 -64
- ultralytics/utils/triton.py +13 -3
- ultralytics/utils/tuner.py +13 -25
- {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/METADATA +2 -2
- ultralytics-8.3.90.dist-info/RECORD +250 -0
- ultralytics-8.3.88.dist-info/RECORD +0 -250
- {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/LICENSE +0 -0
- {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.88.dist-info → ultralytics-8.3.90.dist-info}/top_level.txt +0 -0
ultralytics/models/rtdetr/val.py
CHANGED
@@ -22,13 +22,20 @@ class RTDETRDataset(YOLODataset):
         """Initialize the RTDETRDataset class by inheriting from the YOLODataset class."""
         super().__init__(*args, data=data, **kwargs)
 
-    # NOTE: add stretch version load_image for RTDETR mosaic
     def load_image(self, i, rect_mode=False):
         """Loads 1 image from dataset index 'i', returns (im, resized hw)."""
         return super().load_image(i=i, rect_mode=rect_mode)
 
     def build_transforms(self, hyp=None):
-        """
+        """
+        Build transformation pipeline for the dataset.
+
+        Args:
+            hyp (Dict, optional): Hyperparameters for transformations.
+
+        Returns:
+            (Compose): Composition of transformation functions.
+        """
         if self.augment:
             hyp.mosaic = hyp.mosaic if self.augment and not self.rect else 0.0
             hyp.mixup = hyp.mixup if self.augment and not self.rect else 0.0
@@ -58,14 +65,11 @@ class RTDETRValidator(DetectionValidator):
     The class allows building of an RTDETR-specific dataset for validation, applies Non-maximum suppression for
     post-processing, and updates evaluation metrics accordingly.
 
-
-
-
-
-
-        validator = RTDETRValidator(args=args)
-        validator()
-        ```
+    Examples:
+        >>> from ultralytics.models.rtdetr import RTDETRValidator
+        >>> args = dict(model="rtdetr-l.pt", data="coco8.yaml")
+        >>> validator = RTDETRValidator(args=args)
+        >>> validator()
 
     Note:
         For further details on the attributes and methods, refer to the parent DetectionValidator class.
@@ -78,7 +82,10 @@ class RTDETRValidator(DetectionValidator):
         Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
-            batch (int, optional): Size of batches, this is for `rect`.
+            batch (int, optional): Size of batches, this is for `rect`.
+
+        Returns:
+            (RTDETRDataset): Dataset configured for RT-DETR validation.
         """
         return RTDETRDataset(
             img_path=img_path,
@@ -93,7 +100,15 @@ class RTDETRValidator(DetectionValidator):
         )
 
     def postprocess(self, preds):
-        """
+        """
+        Apply Non-maximum suppression to prediction outputs.
+
+        Args:
+            preds (List | Tuple | torch.Tensor): Raw predictions from the model.
+
+        Returns:
+            (List[torch.Tensor]): List of processed predictions for each image in batch.
+        """
         if not isinstance(preds, (list, tuple)):  # list for PyTorch inference but list[0] Tensor for export inference
             preds = [preds, None]
 
@@ -114,7 +129,16 @@ class RTDETRValidator(DetectionValidator):
         return outputs
 
     def _prepare_batch(self, si, batch):
-        """
+        """
+        Prepares a batch for validation by applying necessary transformations.
+
+        Args:
+            si (int): Batch index.
+            batch (Dict): Batch data containing images and annotations.
+
+        Returns:
+            (Dict): Prepared batch with transformed annotations.
+        """
         idx = batch["batch_idx"] == si
         cls = batch["cls"][idx].squeeze(-1)
         bbox = batch["bboxes"][idx]
@@ -128,7 +152,16 @@ class RTDETRValidator(DetectionValidator):
         return {"cls": cls, "bbox": bbox, "ori_shape": ori_shape, "imgsz": imgsz, "ratio_pad": ratio_pad}
 
     def _prepare_pred(self, pred, pbatch):
-        """
+        """
+        Prepares predictions by scaling bounding boxes to original image dimensions.
+
+        Args:
+            pred (torch.Tensor): Raw predictions.
+            pbatch (Dict): Prepared batch information.
+
+        Returns:
+            (torch.Tensor): Predictions scaled to original image dimensions.
+        """
         predn = pred.clone()
         predn[..., [0, 2]] *= pbatch["ori_shape"][1] / self.args.imgsz  # native-space pred
         predn[..., [1, 3]] *= pbatch["ori_shape"][0] / self.args.imgsz  # native-space pred
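The refreshed RTDETRValidator docstring above now carries a runnable Examples block. The same flow works as a plain script; this is a minimal sketch of that documented usage, assuming network access so the rtdetr-l.pt weights and the coco8 dataset can be fetched automatically:

    # Minimal sketch mirroring the new docstring example: standalone RT-DETR validation.
    from ultralytics.models.rtdetr import RTDETRValidator

    args = dict(model="rtdetr-l.pt", data="coco8.yaml")
    validator = RTDETRValidator(args=args)
    validator()  # runs the full validation loop and reports detection metrics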
ultralytics/models/sam/__init__.py
CHANGED
@@ -3,4 +3,4 @@
 from .model import SAM
 from .predict import Predictor, SAM2Predictor, SAM2VideoPredictor
 
-__all__ = "SAM", "Predictor", "SAM2Predictor", "SAM2VideoPredictor"  # tuple or list
+__all__ = "SAM", "Predictor", "SAM2Predictor", "SAM2VideoPredictor"  # tuple or list of exportable items
ultralytics/models/sam/amg.py
CHANGED
@@ -76,7 +76,24 @@ def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer:
 def generate_crop_boxes(
     im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
 ) -> Tuple[List[List[int]], List[int]]:
-    """
+    """
+    Generates crop boxes of varying sizes for multiscale image processing, with layered overlapping regions.
+
+    Args:
+        im_size (Tuple[int, ...]): Height and width of the input image.
+        n_layers (int): Number of layers to generate crop boxes for.
+        overlap_ratio (float): Ratio of overlap between adjacent crop boxes.
+
+    Returns:
+        (List[List[int]]): List of crop boxes in [x0, y0, x1, y1] format.
+        (List[int]): List of layer indices corresponding to each crop box.
+
+    Examples:
+        >>> im_size = (800, 1200)  # Height, width
+        >>> n_layers = 3
+        >>> overlap_ratio = 0.25
+        >>> crop_boxes, layer_idxs = generate_crop_boxes(im_size, n_layers, overlap_ratio)
+    """
     crop_boxes, layer_idxs = [], []
     im_h, im_w = im_size
     short_side = min(im_h, im_w)
@@ -86,7 +103,7 @@ def generate_crop_boxes(
     layer_idxs.append(0)
 
     def crop_len(orig_len, n_crops, overlap):
-        """
+        """Calculates the length of each crop given the original length, number of crops, and overlap."""
         return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops))
 
     for i_layer in range(n_layers):
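The new generate_crop_boxes docstring above spells out its inputs and outputs; as a standalone sketch of the same documented example (import path taken from this package, printed values not asserted):

    # Sketch mirroring the documented Examples block for generate_crop_boxes.
    from ultralytics.models.sam.amg import generate_crop_boxes

    im_size = (800, 1200)  # (height, width)
    crop_boxes, layer_idxs = generate_crop_boxes(im_size, n_layers=3, overlap_ratio=0.25)
    print(len(crop_boxes), crop_boxes[0])  # the first box spans the full image; later layers tile it with overlap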
@@ -140,7 +157,24 @@ def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int) -> torch.Tensor:
 
 
 def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tuple[np.ndarray, bool]:
-    """
+    """
+    Removes small disconnected regions or holes in a mask based on area threshold and mode.
+
+    Args:
+        mask (np.ndarray): Binary mask to process.
+        area_thresh (float): Area threshold below which regions will be removed.
+        mode (str): Processing mode, either 'holes' to fill small holes or 'islands' to remove small disconnected regions.
+
+    Returns:
+        (np.ndarray): Processed binary mask with small regions removed.
+        (bool): Whether any regions were modified.
+
+    Examples:
+        >>> mask = np.zeros((100, 100), dtype=np.bool_)
+        >>> mask[40:60, 40:60] = True  # Create a square
+        >>> mask[45:55, 45:55] = False  # Create a hole
+        >>> processed_mask, modified = remove_small_regions(mask, 50, "holes")
+    """
     import cv2  # type: ignore
 
     assert mode in {"holes", "islands"}, f"Provided mode {mode} is invalid"
@@ -160,7 +194,19 @@ def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tuple[np.ndarray, bool]:
 
 
 def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
-    """
+    """
+    Calculates bounding boxes in XYXY format around binary masks.
+
+    Args:
+        masks (torch.Tensor): Binary masks with shape (B, H, W) or (B, C, H, W).
+
+    Returns:
+        (torch.Tensor): Bounding boxes in XYXY format with shape (B, 4) or (B, C, 4).
+
+    Notes:
+        - Handles empty masks by returning zero boxes.
+        - Preserves input tensor dimensions in the output.
+    """
     # torch.max below raises an error on empty inputs, just skip in this case
     if torch.numel(masks) == 0:
         return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
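The two amg.py helpers documented above compose naturally: clean up a binary mask, then box it. A hedged sketch based on the docstring example; the area threshold is raised to 200 here (an assumption, not the documented value) so the 10x10 hole is actually small enough to be filled:

    import numpy as np
    import torch
    from ultralytics.models.sam.amg import batched_mask_to_box, remove_small_regions

    mask = np.zeros((100, 100), dtype=np.bool_)
    mask[40:60, 40:60] = True   # a 20x20 square
    mask[45:55, 45:55] = False  # a 10x10 hole inside it
    filled, modified = remove_small_regions(mask, area_thresh=200, mode="holes")  # fills holes smaller than 200 px

    box = batched_mask_to_box(torch.from_numpy(filled)[None])  # add a batch dim -> shape (1, 4)
    print(modified, box)  # True, plus a tight XYXY box around the restored square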
ultralytics/models/sam/model.py
CHANGED
@@ -49,7 +49,7 @@ class SAM(Model):
 
     def __init__(self, model="sam_b.pt") -> None:
         """
-
+        Initialize the SAM (Segment Anything Model) instance.
 
         Args:
             model (str): Path to the pre-trained SAM model file. File should have a .pt or .pth extension.
@@ -68,10 +68,7 @@ class SAM(Model):
 
     def _load(self, weights: str, task=None):
         """
-
-
-        This method initializes the SAM model with the provided weights file, setting up the model architecture
-        and loading the pre-trained parameters.
+        Load the specified weights into the SAM model.
 
         Args:
             weights (str): Path to the weights file. Should be a .pt or .pth file containing the model parameters.
@@ -85,7 +82,7 @@ class SAM(Model):
 
     def predict(self, source, stream=False, bboxes=None, points=None, labels=None, **kwargs):
         """
-
+        Perform segmentation prediction on the given image or video source.
 
         Args:
             source (str | PIL.Image | numpy.ndarray): Path to the image or video file, or a PIL.Image object, or
@@ -112,7 +109,7 @@ class SAM(Model):
 
     def __call__(self, source=None, stream=False, bboxes=None, points=None, labels=None, **kwargs):
         """
-
+        Perform segmentation prediction on the given image or video source.
 
         This method is an alias for the 'predict' method, providing a convenient way to call the SAM model
         for segmentation tasks.
@@ -138,10 +135,7 @@ class SAM(Model):
 
     def info(self, detailed=False, verbose=True):
         """
-
-
-        This method provides details about the Segment Anything Model (SAM), including its architecture,
-        parameters, and computational requirements.
+        Log information about the SAM model.
 
         Args:
             detailed (bool): If True, displays detailed information about the model layers and operations.
@@ -160,16 +154,16 @@ class SAM(Model):
     @property
     def task_map(self):
         """
-
+        Provide a mapping from the 'segment' task to its corresponding 'Predictor'.
 
         Returns:
-            (Dict[str, Type[Predictor]]): A dictionary mapping the 'segment' task to its corresponding Predictor
+            (Dict[str, Dict[str, Type[Predictor]]]): A dictionary mapping the 'segment' task to its corresponding Predictor
                 class. For SAM2 models, it maps to SAM2Predictor, otherwise to the standard Predictor.
 
         Examples:
             >>> sam = SAM("sam_b.pt")
            >>> task_map = sam.task_map
            >>> print(task_map)
-            {'segment': <class 'ultralytics.models.sam.predict.Predictor'>}
+            {'segment': {'predictor': <class 'ultralytics.models.sam.predict.Predictor'>}}
         """
         return {"segment": {"predictor": SAM2Predictor if self.is_sam2 else Predictor}}
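The trimmed SAM docstrings above leave the prompt-based API itself unchanged. For orientation, a hedged sketch of that API as documented ("image.jpg" and the prompt coordinates are placeholders; weights are fetched on first use):

    from ultralytics import SAM

    sam = SAM("sam_b.pt")
    sam.info()  # log a summary of the model architecture and parameters
    results = sam.predict("image.jpg", bboxes=[100, 100, 400, 400])      # box prompt
    results = sam.predict("image.jpg", points=[[250, 250]], labels=[1])  # positive point prompt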
ultralytics/models/sam/modules/decoders.py
CHANGED
@@ -48,7 +48,7 @@ class MaskDecoder(nn.Module):
         iou_head_hidden_dim: int = 256,
     ) -> None:
         """
-
+        Initialize the MaskDecoder module for generating masks and their associated quality scores.
 
         Args:
             transformer_dim (int): Channel dimension for the transformer module.
@@ -95,7 +95,7 @@ class MaskDecoder(nn.Module):
         multimask_output: bool,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
-
+        Predict masks given image and prompt embeddings.
 
         Args:
             image_embeddings (torch.Tensor): Embeddings from the image encoder.
@@ -105,9 +105,8 @@ class MaskDecoder(nn.Module):
             multimask_output (bool): Whether to return multiple masks or a single mask.
 
         Returns:
-            (
-
-            - iou_pred (torch.Tensor): Batched predictions of mask quality.
+            masks (torch.Tensor): Batched predicted masks.
+            iou_pred (torch.Tensor): Batched predictions of mask quality.
 
         Examples:
             >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
@@ -140,7 +139,7 @@ class MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
+        """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
         # Concatenate output tokens
         output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
         output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.shape[0], -1, -1)
@@ -236,7 +235,7 @@ class SAM2MaskDecoder(nn.Module):
         use_multimask_token_for_obj_ptr: bool = False,
     ) -> None:
         """
-
+        Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.
 
         This decoder extends the functionality of MaskDecoder, incorporating additional features such as
         high-resolution feature processing, dynamic multimask output, and object score prediction.
@@ -320,9 +319,9 @@ class SAM2MaskDecoder(nn.Module):
         multimask_output: bool,
         repeat_image: bool,
         high_res_features: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
-
+        Predict masks given image and prompt embeddings.
 
         Args:
             image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
@@ -334,11 +333,10 @@ class SAM2MaskDecoder(nn.Module):
             high_res_features (List[torch.Tensor] | None): Optional high-resolution features.
 
         Returns:
-            (
-
-
-
-            - object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).
+            masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
+            iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
+            sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
+            object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).
 
         Examples:
             >>> image_embeddings = torch.rand(1, 256, 64, 64)
@@ -390,8 +388,8 @@ class SAM2MaskDecoder(nn.Module):
         dense_prompt_embeddings: torch.Tensor,
         repeat_image: bool,
         high_res_features: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Predict instance segmentation masks from image and prompt embeddings using a transformer."""
         # Concatenate output tokens
         s = 0
         if self.pred_obj_scores:
@@ -454,7 +452,7 @@ class SAM2MaskDecoder(nn.Module):
         return masks, iou_pred, mask_tokens_out, object_score_logits
 
     def _get_stability_scores(self, mask_logits):
-        """
+        """Compute mask stability scores based on IoU between upper and lower thresholds."""
         mask_logits = mask_logits.flatten(-2)
         stability_delta = self.dynamic_multimask_stability_delta
         area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
@@ -463,7 +461,7 @@ class SAM2MaskDecoder(nn.Module):
 
     def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
         """
-        Dynamically
+        Dynamically select the most stable mask output based on stability scores and IoU predictions.
 
         This method is used when outputting a single mask. If the stability score from the current single-mask
         output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
@@ -476,9 +474,8 @@ class SAM2MaskDecoder(nn.Module):
             all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).
 
         Returns:
-            (
-
-            - iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).
+            mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
+            iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).
 
         Examples:
             >>> decoder = SAM2MaskDecoder(...)
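The new one-line docstring for _get_stability_scores summarizes the idea: stability is the ratio of the mask area at a raised logit threshold to the area at a lowered one. A standalone, hedged sketch of that computation follows; the function name, the delta default, and the final division are assumptions, since only the thresholded-area counts appear in the hunk above:

    import torch

    def stability_scores(mask_logits: torch.Tensor, delta: float = 0.05) -> torch.Tensor:
        """Return per-mask stability in [0, 1] for logits shaped (..., H, W)."""
        flat = mask_logits.flatten(-2)                      # (..., H*W)
        area_hi = torch.sum(flat > delta, dim=-1).float()   # area above the upper threshold
        area_lo = torch.sum(flat > -delta, dim=-1).float()  # area above the lower threshold
        return torch.where(area_lo > 0, area_hi / area_lo, torch.ones_like(area_hi))

    print(stability_scores(torch.randn(2, 3, 64, 64)).shape)  # torch.Size([2, 3])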
ultralytics/models/sam/modules/encoders.py
CHANGED
@@ -65,7 +65,7 @@ class ImageEncoderViT(nn.Module):
         global_attn_indexes: Tuple[int, ...] = (),
     ) -> None:
         """
-
+        Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
 
         Args:
             img_size (int): Input image size, assumed to be square.
@@ -85,13 +85,6 @@ class ImageEncoderViT(nn.Module):
             window_size (int): Size of attention window for windowed attention blocks.
             global_attn_indexes (Tuple[int, ...]): Indices of blocks that use global attention.
 
-        Attributes:
-            img_size (int): Dimension of input images.
-            patch_embed (PatchEmbed): Module for patch embedding.
-            pos_embed (nn.Parameter | None): Absolute positional embedding for patches.
-            blocks (nn.ModuleList): List of transformer blocks.
-            neck (nn.Sequential): Neck module for final processing.
-
         Examples:
             >>> encoder = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
             >>> input_image = torch.randn(1, 3, 224, 224)
@@ -148,7 +141,7 @@ class ImageEncoderViT(nn.Module):
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
+        """Process input through patch embedding, positional embedding, transformer blocks, and neck module."""
         x = self.patch_embed(x)
         if self.pos_embed is not None:
             pos_embed = (
@@ -201,10 +194,7 @@ class PromptEncoder(nn.Module):
         activation: Type[nn.Module] = nn.GELU,
     ) -> None:
         """
-
-
-        This module encodes different types of prompts (points, boxes, masks) for input to SAM's mask decoder,
-        producing both sparse and dense embeddings.
+        Initialize the PromptEncoder module for encoding various types of prompts.
 
         Args:
             embed_dim (int): The dimension of the embeddings.
@@ -213,17 +203,6 @@ class PromptEncoder(nn.Module):
             mask_in_chans (int): The number of hidden channels used for encoding input masks.
             activation (Type[nn.Module]): The activation function to use when encoding input masks.
 
-        Attributes:
-            embed_dim (int): Dimension of the embeddings.
-            input_image_size (Tuple[int, int]): Size of the input image as (H, W).
-            image_embedding_size (Tuple[int, int]): Spatial size of the image embedding as (H, W).
-            pe_layer (PositionEmbeddingRandom): Module for random position embedding.
-            num_point_embeddings (int): Number of point embeddings for different types of points.
-            point_embeddings (nn.ModuleList): List of point embeddings.
-            not_a_point_embed (nn.Embedding): Embedding for points that are not part of any label.
-            mask_input_size (Tuple[int, int]): Size of the input mask.
-            mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
-
         Examples:
             >>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
             >>> points = (torch.rand(1, 5, 2), torch.randint(0, 4, (1, 5)))
@@ -258,9 +237,9 @@ class PromptEncoder(nn.Module):
 
     def get_dense_pe(self) -> torch.Tensor:
         """
-
+        Return the dense positional encoding used for encoding point prompts.
 
-
+        Generate a positional encoding for a dense set of points matching the shape of the image
         encoding. The encoding is used to provide spatial information to the model when processing point prompts.
 
         Returns:
@@ -276,7 +255,7 @@ class PromptEncoder(nn.Module):
         return self.pe_layer(self.image_embedding_size).unsqueeze(0)
 
     def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor:
-        """
+        """Embed point prompts by applying positional encoding and label-specific embeddings."""
         points = points + 0.5  # Shift to center of pixel
         if pad:
             padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
@@ -293,7 +272,7 @@ class PromptEncoder(nn.Module):
         return point_embedding
 
     def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
-        """
+        """Embed box prompts by applying positional encoding and adding corner embeddings."""
         boxes = boxes + 0.5  # Shift to center of pixel
         coords = boxes.reshape(-1, 2, 2)
         corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
@@ -302,7 +281,7 @@ class PromptEncoder(nn.Module):
         return corner_embedding
 
     def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
-        """
+        """Embed mask inputs by downscaling and processing through convolutional layers."""
         return self.mask_downscaling(masks)
 
     @staticmethod
@@ -311,7 +290,7 @@ class PromptEncoder(nn.Module):
         boxes: Optional[torch.Tensor],
         masks: Optional[torch.Tensor],
     ) -> int:
-        """
+        """Get the batch size of the output given the batch size of the input prompts."""
         if points is not None:
             return points[0].shape[0]
         elif boxes is not None:
@@ -322,7 +301,7 @@ class PromptEncoder(nn.Module):
         return 1
 
     def _get_device(self) -> torch.device:
-        """
+        """Return the device of the first point embedding's weight tensor."""
         return self.point_embeddings[0].weight.device
 
     def forward(
@@ -332,7 +311,7 @@ class PromptEncoder(nn.Module):
         masks: Optional[torch.Tensor],
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
-
+        Embed different types of prompts, returning both sparse and dense embeddings.
 
         Args:
             points (Tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
@@ -377,7 +356,7 @@ class PromptEncoder(nn.Module):
 
 class MemoryEncoder(nn.Module):
     """
-
+    Encode pixel features and masks into a memory representation for efficient image segmentation.
 
     This class processes pixel-level features and masks, fusing them to generate encoded memory representations
     suitable for downstream tasks in image segmentation models like SAM (Segment Anything Model).
@@ -390,7 +369,7 @@ class MemoryEncoder(nn.Module):
         out_proj (nn.Module): Output projection layer, either nn.Identity or nn.Conv2d.
 
     Methods:
-        forward:
+        forward: Process input pixel features and masks to generate encoded memory representations.
 
     Examples:
         >>> import torch
@@ -407,7 +386,7 @@ class MemoryEncoder(nn.Module):
         out_dim,
         in_dim=256,  # in_dim of pix_feats
     ):
-        """
+        """Initialize the MemoryEncoder for encoding pixel features and masks into memory representations."""
         super().__init__()
 
         self.mask_downsampler = MaskDownSampler(kernel_size=3, stride=2, padding=1)
@@ -425,7 +404,7 @@ class MemoryEncoder(nn.Module):
         masks: torch.Tensor,
         skip_mask_sigmoid: bool = False,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
+        """Process pixel features and masks to generate encoded memory representations for segmentation."""
         if not skip_mask_sigmoid:
             masks = F.sigmoid(masks)
         masks = self.mask_downsampler(masks)
@@ -445,7 +424,7 @@ class MemoryEncoder(nn.Module):
 
 class ImageEncoder(nn.Module):
     """
-
+    Encode images using a trunk-neck architecture, producing multiscale features and positional encodings.
 
     This class combines a trunk network for feature extraction with a neck network for feature refinement
     and positional encoding generation. It can optionally discard the lowest resolution features.
@@ -456,7 +435,7 @@ class ImageEncoder(nn.Module):
         scalp (int): Number of lowest resolution feature levels to discard.
 
     Methods:
-        forward:
+        forward: Process the input image through the trunk and neck networks.
 
     Examples:
         >>> trunk = SomeTrunkNetwork()
@@ -474,7 +453,7 @@ class ImageEncoder(nn.Module):
         neck: nn.Module,
         scalp: int = 0,
     ):
-        """
+        """Initialize the ImageEncoder with trunk and neck networks for feature extraction and refinement."""
         super().__init__()
         self.trunk = trunk
         self.neck = neck
@@ -484,7 +463,7 @@ class ImageEncoder(nn.Module):
         )
 
     def forward(self, sample: torch.Tensor):
-        """
+        """Encode input through patch embedding, positional embedding, transformer blocks, and neck module."""
         features, pos = self.neck(self.trunk(sample))
         if self.scalp > 0:
             # Discard the lowest resolution features
@@ -514,7 +493,7 @@ class FpnNeck(nn.Module):
         fpn_top_down_levels (List[int]): Levels to have top-down features in outputs.
 
     Methods:
-        forward:
+        forward: Perform forward pass through the FPN neck.
 
     Examples:
         >>> backbone_channels = [64, 128, 256, 512]
@@ -665,8 +644,8 @@ class Hiera(nn.Module):
         channel_list (List[int]): List of output channel dimensions for each stage.
 
     Methods:
-        _get_pos_embed:
-        forward:
+        _get_pos_embed: Generate positional embeddings by interpolating and combining window and background embeddings.
+        forward: Perform the forward pass through the Hiera model.
 
     Examples:
         >>> model = Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3))
@@ -702,7 +681,7 @@ class Hiera(nn.Module):
         ),
         return_interm_layers=True,  # return feats from every stage
     ):
-        """
+        """Initialize the Hiera model, configuring its hierarchical vision transformer architecture."""
         super().__init__()
 
         assert len(stages) == len(window_spec)
@@ -768,7 +747,7 @@ class Hiera(nn.Module):
         )
 
     def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
-        """
+        """Generate positional embeddings by interpolating and combining window and background embeddings."""
         h, w = hw
         window_embed = self.pos_embed_window
         pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
@@ -777,7 +756,7 @@ class Hiera(nn.Module):
         return pos_embed
 
     def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
-        """
+        """Perform forward pass through Hiera model, extracting multiscale features from input images."""
         x = self.patch_embed(x)
         # x: (B, H, W, C)
 
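The PromptEncoder docstring retained above keeps its Examples block; a hedged sketch built from that block (import path and keyword usage are assumptions drawn from the hunks, and the random labels mirror the documented example):

    import torch
    from ultralytics.models.sam.modules.encoders import PromptEncoder

    # Argument order follows the documented example: embed_dim, image_embedding_size, input_image_size, mask_in_chans.
    prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
    points = (torch.rand(1, 5, 2), torch.randint(0, 4, (1, 5)))  # coordinates and per-point labels
    sparse, dense = prompt_encoder(points=points, boxes=None, masks=None)
    print(sparse.shape, dense.shape)  # sparse point embeddings plus a dense grid matching the image embedding size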