ultralytics 8.3.142__py3-none-any.whl → 8.3.144__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. tests/conftest.py +7 -24
  2. tests/test_cli.py +1 -1
  3. tests/test_cuda.py +7 -2
  4. tests/test_engine.py +7 -8
  5. tests/test_exports.py +16 -16
  6. tests/test_integrations.py +1 -1
  7. tests/test_solutions.py +12 -12
  8. ultralytics/__init__.py +1 -1
  9. ultralytics/cfg/__init__.py +16 -13
  10. ultralytics/data/annotator.py +6 -5
  11. ultralytics/data/augment.py +127 -126
  12. ultralytics/data/base.py +54 -51
  13. ultralytics/data/build.py +47 -23
  14. ultralytics/data/converter.py +47 -43
  15. ultralytics/data/dataset.py +51 -50
  16. ultralytics/data/loaders.py +77 -44
  17. ultralytics/data/split.py +22 -9
  18. ultralytics/data/split_dota.py +63 -39
  19. ultralytics/data/utils.py +59 -39
  20. ultralytics/engine/exporter.py +79 -27
  21. ultralytics/engine/model.py +39 -39
  22. ultralytics/engine/predictor.py +37 -28
  23. ultralytics/engine/results.py +187 -157
  24. ultralytics/engine/trainer.py +36 -19
  25. ultralytics/engine/tuner.py +12 -9
  26. ultralytics/engine/validator.py +7 -9
  27. ultralytics/hub/__init__.py +11 -13
  28. ultralytics/hub/auth.py +22 -2
  29. ultralytics/hub/google/__init__.py +19 -19
  30. ultralytics/hub/session.py +37 -51
  31. ultralytics/hub/utils.py +19 -5
  32. ultralytics/models/fastsam/model.py +30 -12
  33. ultralytics/models/fastsam/predict.py +5 -6
  34. ultralytics/models/fastsam/utils.py +3 -3
  35. ultralytics/models/fastsam/val.py +10 -6
  36. ultralytics/models/nas/model.py +9 -5
  37. ultralytics/models/nas/predict.py +6 -6
  38. ultralytics/models/nas/val.py +3 -3
  39. ultralytics/models/rtdetr/model.py +7 -6
  40. ultralytics/models/rtdetr/predict.py +14 -7
  41. ultralytics/models/rtdetr/train.py +10 -4
  42. ultralytics/models/rtdetr/val.py +36 -9
  43. ultralytics/models/sam/amg.py +30 -12
  44. ultralytics/models/sam/build.py +22 -22
  45. ultralytics/models/sam/model.py +10 -9
  46. ultralytics/models/sam/modules/blocks.py +76 -80
  47. ultralytics/models/sam/modules/decoders.py +6 -8
  48. ultralytics/models/sam/modules/encoders.py +23 -26
  49. ultralytics/models/sam/modules/memory_attention.py +13 -1
  50. ultralytics/models/sam/modules/sam.py +57 -26
  51. ultralytics/models/sam/modules/tiny_encoder.py +232 -237
  52. ultralytics/models/sam/modules/transformer.py +13 -13
  53. ultralytics/models/sam/modules/utils.py +11 -19
  54. ultralytics/models/sam/predict.py +114 -101
  55. ultralytics/models/utils/loss.py +98 -77
  56. ultralytics/models/utils/ops.py +116 -67
  57. ultralytics/models/yolo/classify/predict.py +5 -5
  58. ultralytics/models/yolo/classify/train.py +32 -28
  59. ultralytics/models/yolo/classify/val.py +7 -8
  60. ultralytics/models/yolo/detect/predict.py +1 -0
  61. ultralytics/models/yolo/detect/train.py +15 -14
  62. ultralytics/models/yolo/detect/val.py +37 -36
  63. ultralytics/models/yolo/model.py +106 -23
  64. ultralytics/models/yolo/obb/predict.py +3 -4
  65. ultralytics/models/yolo/obb/train.py +14 -6
  66. ultralytics/models/yolo/obb/val.py +29 -23
  67. ultralytics/models/yolo/pose/predict.py +9 -8
  68. ultralytics/models/yolo/pose/train.py +24 -16
  69. ultralytics/models/yolo/pose/val.py +44 -26
  70. ultralytics/models/yolo/segment/predict.py +5 -5
  71. ultralytics/models/yolo/segment/train.py +11 -7
  72. ultralytics/models/yolo/segment/val.py +2 -2
  73. ultralytics/models/yolo/world/train.py +33 -23
  74. ultralytics/models/yolo/world/train_world.py +11 -3
  75. ultralytics/models/yolo/yoloe/predict.py +11 -11
  76. ultralytics/models/yolo/yoloe/train.py +73 -21
  77. ultralytics/models/yolo/yoloe/train_seg.py +10 -7
  78. ultralytics/models/yolo/yoloe/val.py +42 -18
  79. ultralytics/nn/autobackend.py +59 -15
  80. ultralytics/nn/modules/__init__.py +4 -4
  81. ultralytics/nn/modules/activation.py +4 -1
  82. ultralytics/nn/modules/block.py +178 -111
  83. ultralytics/nn/modules/conv.py +6 -5
  84. ultralytics/nn/modules/head.py +469 -121
  85. ultralytics/nn/modules/transformer.py +147 -58
  86. ultralytics/nn/tasks.py +227 -20
  87. ultralytics/nn/text_model.py +30 -33
  88. ultralytics/solutions/ai_gym.py +1 -1
  89. ultralytics/solutions/analytics.py +7 -4
  90. ultralytics/solutions/config.py +10 -10
  91. ultralytics/solutions/distance_calculation.py +11 -10
  92. ultralytics/solutions/heatmap.py +1 -1
  93. ultralytics/solutions/instance_segmentation.py +6 -3
  94. ultralytics/solutions/object_blurrer.py +3 -3
  95. ultralytics/solutions/object_counter.py +16 -8
  96. ultralytics/solutions/object_cropper.py +12 -5
  97. ultralytics/solutions/parking_management.py +29 -28
  98. ultralytics/solutions/queue_management.py +6 -6
  99. ultralytics/solutions/region_counter.py +10 -3
  100. ultralytics/solutions/security_alarm.py +3 -3
  101. ultralytics/solutions/similarity_search.py +85 -24
  102. ultralytics/solutions/solutions.py +215 -85
  103. ultralytics/solutions/speed_estimation.py +28 -22
  104. ultralytics/solutions/streamlit_inference.py +17 -12
  105. ultralytics/solutions/trackzone.py +4 -4
  106. ultralytics/trackers/basetrack.py +16 -23
  107. ultralytics/trackers/bot_sort.py +30 -20
  108. ultralytics/trackers/byte_tracker.py +70 -64
  109. ultralytics/trackers/track.py +4 -8
  110. ultralytics/trackers/utils/gmc.py +31 -58
  111. ultralytics/trackers/utils/kalman_filter.py +37 -37
  112. ultralytics/trackers/utils/matching.py +1 -1
  113. ultralytics/utils/__init__.py +105 -89
  114. ultralytics/utils/autobatch.py +16 -3
  115. ultralytics/utils/autodevice.py +54 -24
  116. ultralytics/utils/benchmarks.py +42 -28
  117. ultralytics/utils/callbacks/base.py +3 -3
  118. ultralytics/utils/callbacks/clearml.py +9 -9
  119. ultralytics/utils/callbacks/comet.py +67 -25
  120. ultralytics/utils/callbacks/dvc.py +7 -10
  121. ultralytics/utils/callbacks/mlflow.py +2 -5
  122. ultralytics/utils/callbacks/neptune.py +7 -13
  123. ultralytics/utils/callbacks/raytune.py +1 -1
  124. ultralytics/utils/callbacks/tensorboard.py +5 -6
  125. ultralytics/utils/callbacks/wb.py +14 -14
  126. ultralytics/utils/checks.py +14 -13
  127. ultralytics/utils/dist.py +5 -5
  128. ultralytics/utils/downloads.py +94 -67
  129. ultralytics/utils/errors.py +5 -5
  130. ultralytics/utils/export.py +61 -47
  131. ultralytics/utils/files.py +23 -22
  132. ultralytics/utils/instance.py +48 -52
  133. ultralytics/utils/loss.py +78 -40
  134. ultralytics/utils/metrics.py +186 -130
  135. ultralytics/utils/ops.py +186 -190
  136. ultralytics/utils/patches.py +15 -17
  137. ultralytics/utils/plotting.py +71 -27
  138. ultralytics/utils/tal.py +21 -15
  139. ultralytics/utils/torch_utils.py +53 -50
  140. ultralytics/utils/triton.py +5 -4
  141. ultralytics/utils/tuner.py +5 -5
  142. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/METADATA +1 -1
  143. ultralytics-8.3.144.dist-info/RECORD +272 -0
  144. ultralytics-8.3.142.dist-info/RECORD +0 -272
  145. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/WHEEL +0 -0
  146. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/entry_points.txt +0 -0
  147. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/licenses/LICENSE +0 -0
  148. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/top_level.txt +0 -0
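Of the files listed above, the hunks reproduced below are from ultralytics/models/sam/predict.py (+114 −101), a docstring cleanup of the SAM predictor classes. For orientation, here is a minimal sketch of the prompt-based API these docstrings describe, assuming the documented top-level SAM entry point; the checkpoint name and image path are placeholders:

    from ultralytics import SAM

    # Load a SAM checkpoint; prompts can be boxes, points, or point labels
    model = SAM("sam2_b.pt")
    results = model("image.jpg", bboxes=[100, 100, 200, 200])  # box prompt in XYXY pixels
    results = model("image.jpg", points=[[150, 150]], labels=[1])  # point prompt, 1 = foreground

ultralytics/models/sam/predict.py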
@@ -47,26 +47,26 @@ class Predictor(BasePredictor):
  device (torch.device): The device (CPU or GPU) on which the model is loaded.
  im (torch.Tensor): The preprocessed input image.
  features (torch.Tensor): Extracted image features.
- prompts (dict): Dictionary to store various types of prompts (e.g., bboxes, points, masks).
+ prompts (Dict[str, Any]): Dictionary to store various types of prompts (e.g., bboxes, points, masks).
  segment_all (bool): Flag to indicate if full image segmentation should be performed.
  mean (torch.Tensor): Mean values for image normalization.
  std (torch.Tensor): Standard deviation values for image normalization.

  Methods:
- preprocess: Prepares input images for model inference.
- pre_transform: Performs initial transformations on the input image.
- inference: Performs segmentation inference based on input prompts.
+ preprocess: Prepare input images for model inference.
+ pre_transform: Perform initial transformations on the input image.
+ inference: Perform segmentation inference based on input prompts.
  prompt_inference: Internal function for prompt-based segmentation inference.
- generate: Generates segmentation masks for an entire image.
- setup_model: Initializes the SAM model for inference.
- get_model: Builds and returns a SAM model.
- postprocess: Post-processes model outputs to generate final results.
- setup_source: Sets up the data source for inference.
- set_image: Sets and preprocesses a single image for inference.
- get_im_features: Extracts image features using the SAM image encoder.
- set_prompts: Sets prompts for subsequent inference.
- reset_image: Resets the current image and its features.
- remove_small_regions: Removes small disconnected regions and holes from masks.
+ generate: Generate segmentation masks for an entire image.
+ setup_model: Initialize the SAM model for inference.
+ get_model: Build and return a SAM model.
+ postprocess: Post-process model outputs to generate final results.
+ setup_source: Set up the data source for inference.
+ set_image: Set and preprocess a single image for inference.
+ get_im_features: Extract image features using the SAM image encoder.
+ set_prompts: Set prompts for subsequent inference.
+ reset_image: Reset the current image and its features.
+ remove_small_regions: Remove small disconnected regions and holes from masks.

  Examples:
  >>> predictor = Predictor()
@@ -86,8 +86,8 @@ class Predictor(BasePredictor):

  Args:
  cfg (dict): Configuration dictionary containing default settings.
- overrides (Dict | None): Dictionary of values to override default configuration.
- _callbacks (Dict | None): Dictionary of callback functions to customize behavior.
+ overrides (dict | None): Dictionary of values to override default configuration.
+ _callbacks (dict | None): Dictionary of callback functions to customize behavior.

  Examples:
  >>> predictor_example = Predictor(cfg=DEFAULT_CFG)
@@ -115,7 +115,7 @@ class Predictor(BasePredictor):
  im (torch.Tensor | List[np.ndarray]): Input image(s) in BCHW tensor format or list of HWC numpy arrays.

  Returns:
- im (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.
+ (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.

  Examples:
  >>> predictor = Predictor()
@@ -182,9 +182,9 @@ class Predictor(BasePredictor):
  **kwargs (Any): Additional keyword arguments.

  Returns:
- (np.ndarray): The output masks in shape (C, H, W), where C is the number of generated masks.
- (np.ndarray): An array of length C containing quality scores predicted by the model for each mask.
- (np.ndarray): Low-resolution logits of shape (C, H, W) for subsequent inference, where H=W=256.
+ pred_masks (np.ndarray): The output masks in shape (C, H, W), where C is the number of generated masks.
+ pred_scores (np.ndarray): An array of length C containing quality scores predicted by the model for each mask.
+ pred_logits (np.ndarray): Low-resolution logits of shape (C, H, W) for subsequent inference, where H=W=256.

  Examples:
  >>> predictor = Predictor()
@@ -205,7 +205,7 @@ class Predictor(BasePredictor):

  def prompt_inference(self, im, bboxes=None, points=None, labels=None, masks=None, multimask_output=False):
  """
- Performs image segmentation inference based on input cues using SAM's specialized architecture.
+ Perform image segmentation inference based on input cues using SAM's specialized architecture.

  This internal function leverages the Segment Anything Model (SAM) for prompt-based, real-time segmentation.
  It processes various input prompts such as bounding boxes, points, and masks to generate segmentation masks.
@@ -218,12 +218,9 @@ class Predictor(BasePredictor):
  masks (np.ndarray | None): Low-res masks from previous predictions with shape (N, H, W). For SAM, H=W=256.
  multimask_output (bool): Flag to return multiple masks for ambiguous prompts.

- Raises:
- AssertionError: If the number of points don't match the number of labels, in case labels were passed.
-
  Returns:
- (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
- (np.ndarray): Quality scores predicted by the model for each mask, with length C.
+ pred_masks (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
+ pred_scores (np.ndarray): Quality scores predicted by the model for each mask, with length C.

  Examples:
  >>> predictor = Predictor()
@@ -253,20 +250,23 @@ class Predictor(BasePredictor):

  def _prepare_prompts(self, dst_shape, bboxes=None, points=None, labels=None, masks=None):
  """
- Prepares and transforms the input prompts for processing based on the destination shape.
+ Prepare and transform the input prompts for processing based on the destination shape.

  Args:
  dst_shape (tuple): The target shape (height, width) for the prompts.
  bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
  points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
  labels (np.ndarray | List | None): Point prompt labels with shape (N) or (N, num_points). 1 for foreground, 0 for background.
- masks (List | np.ndarray, Optional): Masks for the objects, where each mask is a 2D array.
+ masks (List | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
+
+ Returns:
+ bboxes (torch.Tensor | None): Transformed bounding boxes.
+ points (torch.Tensor | None): Transformed points.
+ labels (torch.Tensor | None): Transformed labels.
+ masks (torch.Tensor | None): Transformed masks.

  Raises:
  AssertionError: If the number of points don't match the number of labels, in case labels were passed.
-
- Returns:
- (tuple): A tuple containing transformed bounding boxes, points, labels, and masks.
  """
  src_shape = self.batch[1][0].shape[:2]
  r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1])
@@ -407,7 +407,7 @@ class Predictor(BasePredictor):

  def setup_model(self, model=None, verbose=True):
  """
- Initializes the Segment Anything Model (SAM) for inference.
+ Initialize the Segment Anything Model (SAM) for inference.

  This method sets up the SAM model by allocating it to the appropriate device and initializing the necessary
  parameters for image normalization and other Ultralytics compatibility settings.
@@ -437,20 +437,20 @@ class Predictor(BasePredictor):
  self.done_warmup = True

  def get_model(self):
- """Retrieves or builds the Segment Anything Model (SAM) for image segmentation tasks."""
+ """Retrieve or build the Segment Anything Model (SAM) for image segmentation tasks."""
  from .build import build_sam # slow import

  return build_sam(self.args.model)

  def postprocess(self, preds, img, orig_imgs):
  """
- Post-processes SAM's inference outputs to generate object detection masks and bounding boxes.
+ Post-process SAM's inference outputs to generate object detection masks and bounding boxes.

  This method scales masks and boxes to the original image size and applies a threshold to the mask
  predictions. It leverages SAM's advanced architecture for real-time, promptable segmentation tasks.

  Args:
- preds (Tuple[torch.Tensor]): The output from SAM model inference, containing:
+ preds (tuple): The output from SAM model inference, containing:
  - pred_masks (torch.Tensor): Predicted masks with shape (N, 1, H, W).
  - pred_scores (torch.Tensor): Confidence scores for each mask with shape (N, 1).
  - pred_bboxes (torch.Tensor, optional): Predicted bounding boxes if segment_all is True.
@@ -458,7 +458,7 @@ class Predictor(BasePredictor):
  orig_imgs (List[np.ndarray] | torch.Tensor): The original, unprocessed images.

  Returns:
- results (List[Results]): List of Results objects containing detection masks, bounding boxes, and other
+ (List[Results]): List of Results objects containing detection masks, bounding boxes, and other
  metadata for each processed image.

  Examples:
@@ -495,7 +495,7 @@ class Predictor(BasePredictor):

  def setup_source(self, source):
  """
- Sets up the data source for inference.
+ Set up the data source for inference.

  This method configures the data source from which images will be fetched for inference. It supports
  various input types such as image files, directories, video files, and other compatible data sources.
@@ -520,7 +520,7 @@ class Predictor(BasePredictor):

  def set_image(self, image):
  """
- Preprocesses and sets a single image for inference.
+ Preprocess and set a single image for inference.

  This method prepares the model for inference on a single image by setting up the model if not already
  initialized, configuring the data source, and preprocessing the image for feature extraction. It
@@ -530,14 +530,14 @@ class Predictor(BasePredictor):
  image (str | np.ndarray): Path to the image file as a string, or a numpy array representing
  an image read by cv2.

- Raises:
- AssertionError: If more than one image is attempted to be set.
-
  Examples:
  >>> predictor = Predictor()
  >>> predictor.set_image("path/to/image.jpg")
  >>> predictor.set_image(cv2.imread("path/to/image.jpg"))

+ Raises:
+ AssertionError: If more than one image is attempted to be set.
+
  Notes:
  - This method should be called before performing inference on a new image.
  - The extracted features are stored in the `self.features` attribute for later use.
@@ -552,7 +552,7 @@ class Predictor(BasePredictor):
  break

  def get_im_features(self, im):
- """Extracts image features using the SAM model's image encoder for subsequent mask prediction."""
+ """Extract image features using the SAM model's image encoder for subsequent mask prediction."""
  assert isinstance(self.imgsz, (tuple, list)) and self.imgsz[0] == self.imgsz[1], (
  f"SAM models only support square image size, but got {self.imgsz}."
  )
@@ -560,11 +560,11 @@ class Predictor(BasePredictor):
  return self.model.image_encoder(im)

  def set_prompts(self, prompts):
- """Sets prompts for subsequent inference operations."""
+ """Set prompts for subsequent inference operations."""
  self.prompts = prompts

  def reset_image(self):
- """Resets the current image and its features, clearing them for subsequent inference."""
+ """Reset the current image and its features, clearing them for subsequent inference."""
  self.im = None
  self.features = None

@@ -630,18 +630,18 @@ class SAM2Predictor(Predictor):
  prompt-based inference.

  Attributes:
- _bb_feat_sizes (List[Tuple[int, int]]): Feature sizes for different backbone levels.
+ _bb_feat_sizes (List[tuple]): Feature sizes for different backbone levels.
  model (torch.nn.Module): The loaded SAM2 model.
  device (torch.device): The device (CPU or GPU) on which the model is loaded.
- features (Dict[str, torch.Tensor]): Cached image features for efficient inference.
+ features (dict): Cached image features for efficient inference.
  segment_all (bool): Flag to indicate if all segments should be predicted.
- prompts (dict): Dictionary to store various types of prompts for inference.
+ prompts (Dict[str, Any]): Dictionary to store various types of prompts for inference.

  Methods:
- get_model: Retrieves and initializes the SAM2 model.
- prompt_inference: Performs image segmentation inference based on various prompts.
- set_image: Preprocesses and sets a single image for inference.
- get_im_features: Extracts and processes image features using SAM2's image encoder.
+ get_model: Retrieve and initialize the SAM2 model.
+ prompt_inference: Perform image segmentation inference based on various prompts.
+ set_image: Preprocess and set a single image for inference.
+ get_im_features: Extract and process image features using SAM2's image encoder.

  Examples:
  >>> predictor = SAM2Predictor(cfg)
@@ -658,7 +658,7 @@ class SAM2Predictor(Predictor):
  ]

  def get_model(self):
- """Retrieves and initializes the Segment Anything Model 2 (SAM2) for image segmentation tasks."""
+ """Retrieve and initialize the Segment Anything Model 2 (SAM2) for image segmentation tasks."""
  from .build import build_sam # slow import

  return build_sam(self.args.model)
@@ -674,7 +674,7 @@ class SAM2Predictor(Predictor):
  img_idx=-1,
  ):
  """
- Performs image segmentation inference based on various prompts using SAM2 architecture.
+ Perform image segmentation inference based on various prompts using SAM2 architecture.

  This method leverages the Segment Anything Model 2 (SAM2) to generate segmentation masks for input images
  based on provided prompts such as bounding boxes, points, or existing masks. It supports both single and
@@ -690,8 +690,8 @@ class SAM2Predictor(Predictor):
  img_idx (int): Index of the image in the batch to process.

  Returns:
- (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
- (np.ndarray): Quality scores for each mask, with length C.
+ pred_masks (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
+ pred_scores (np.ndarray): Quality scores for each mask, with length C.

  Examples:
  >>> predictor = SAM2Predictor(cfg)
@@ -733,20 +733,22 @@ class SAM2Predictor(Predictor):

  def _prepare_prompts(self, dst_shape, bboxes=None, points=None, labels=None, masks=None):
  """
- Prepares and transforms the input prompts for processing based on the destination shape.
+ Prepare and transform the input prompts for processing based on the destination shape.

  Args:
  dst_shape (tuple): The target shape (height, width) for the prompts.
  bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
  points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
  labels (np.ndarray | List | None): Point prompt labels with shape (N,) or (N, num_points). 1 for foreground, 0 for background.
- masks (List | np.ndarray, Optional): Masks for the objects, where each mask is a 2D array.
+ masks (List | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
+
+ Returns:
+ points (torch.Tensor | None): Transformed points.
+ labels (torch.Tensor | None): Transformed labels.
+ masks (torch.Tensor | None): Transformed masks.

  Raises:
  AssertionError: If the number of points don't match the number of labels, in case labels were passed.
-
- Returns:
- (tuple): A tuple containing transformed points, labels, and masks.
  """
  bboxes, points, labels, masks = super()._prepare_prompts(dst_shape, bboxes, points, labels, masks)
  if bboxes is not None:
@@ -763,7 +765,7 @@ class SAM2Predictor(Predictor):

  def set_image(self, image):
  """
- Preprocesses and sets a single image for inference using the SAM2 model.
+ Preprocess and set a single image for inference using the SAM2 model.

  This method initializes the model if not already done, configures the data source to the specified image,
  and preprocesses the image for feature extraction. It supports setting only one image at a time.
@@ -771,14 +773,14 @@ class SAM2Predictor(Predictor):
  Args:
  image (str | np.ndarray): Path to the image file as a string, or a numpy array representing the image.

- Raises:
- AssertionError: If more than one image is attempted to be set.
-
  Examples:
  >>> predictor = SAM2Predictor()
  >>> predictor.set_image("path/to/image.jpg")
  >>> predictor.set_image(np.array([...])) # Using a numpy array

+ Raises:
+ AssertionError: If more than one image is attempted to be set.
+
  Notes:
  - This method must be called before performing any inference on a new image.
  - The method caches the extracted features for efficient subsequent inferences on the same image.
@@ -794,7 +796,7 @@ class SAM2Predictor(Predictor):
  break

  def get_im_features(self, im):
- """Extracts image features from the SAM image encoder for subsequent processing."""
+ """Extract image features from the SAM image encoder for subsequent processing."""
  assert isinstance(self.imgsz, (tuple, list)) and self.imgsz[0] == self.imgsz[1], (
  f"SAM 2 models only support square image size, but got {self.imgsz}."
  )
@@ -827,10 +829,20 @@ class SAM2VideoPredictor(SAM2Predictor):
  clear_non_cond_mem_for_multi_obj (bool): A flag to control clearing non-conditional memory for multi-object scenarios.
  callbacks (dict): A dictionary of callbacks for various prediction lifecycle events.

- Args:
- cfg (dict, Optional): Configuration settings for the predictor. Defaults to DEFAULT_CFG.
- overrides (dict, Optional): Additional configuration overrides. Defaults to None.
- _callbacks (list, Optional): Custom callbacks to be added. Defaults to None.
+ Methods:
+ get_model: Retrieve and configure the model with binarization enabled.
+ inference: Perform image segmentation inference based on the given input cues.
+ postprocess: Post-process the predictions to apply non-overlapping constraints if required.
+ add_new_prompts: Add new points or masks to a specific frame for a given object ID.
+ propagate_in_video_preflight: Prepare inference_state and consolidate temporary outputs before tracking.
+ init_state: Initialize an inference state for the predictor.
+ get_im_features: Extract and process image features using SAM2's image encoder for subsequent segmentation tasks.
+
+ Examples:
+ >>> predictor = SAM2VideoPredictor(cfg=DEFAULT_CFG)
+ >>> predictor.set_image("path/to/video_frame.jpg")
+ >>> bboxes = [[100, 100, 200, 200]]
+ >>> results = predictor(bboxes=bboxes)

  Note:
  The `fill_hole_area` attribute is defined but not used in the current implementation.
@@ -848,8 +860,8 @@ class SAM2VideoPredictor(SAM2Predictor):

  Args:
  cfg (dict): Configuration dictionary containing default settings.
- overrides (Dict | None): Dictionary of values to override default configuration.
- _callbacks (Dict | None): Dictionary of callback functions to customize behavior.
+ overrides (dict | None): Dictionary of values to override default configuration.
+ _callbacks (dict | None): Dictionary of callback functions to customize behavior.

  Examples:
  >>> predictor = SAM2VideoPredictor(cfg=DEFAULT_CFG)
@@ -865,7 +877,7 @@ class SAM2VideoPredictor(SAM2Predictor):

  def get_model(self):
  """
- Retrieves and configures the model with binarization enabled.
+ Retrieve and configure the model with binarization enabled.

  Note:
  This method overrides the base class implementation to set the binarize flag to True.
@@ -888,8 +900,8 @@ class SAM2VideoPredictor(SAM2Predictor):
  masks (np.ndarray, optional): Low-resolution masks from previous predictions shape (N,H,W). For SAM H=W=256.

  Returns:
- (np.ndarray): The output masks in shape CxHxW, where C is the number of generated masks.
- (np.ndarray): An array of length C containing quality scores predicted by the model for each mask.
+ pred_masks (np.ndarray): The output masks in shape CxHxW, where C is the number of generated masks.
+ pred_scores (np.ndarray): An array of length C containing quality scores predicted by the model for each mask.
  """
  # Override prompts if any stored in self.prompts
  bboxes = self.prompts.pop("bboxes", bboxes)
@@ -947,19 +959,19 @@ class SAM2VideoPredictor(SAM2Predictor):

  def postprocess(self, preds, img, orig_imgs):
  """
- Post-processes the predictions to apply non-overlapping constraints if required.
+ Post-process the predictions to apply non-overlapping constraints if required.

  This method extends the post-processing functionality by applying non-overlapping constraints
  to the predicted masks if the `non_overlap_masks` flag is set to True. This ensures that
  the masks do not overlap, which can be useful for certain applications.

  Args:
- preds (Tuple[torch.Tensor]): The predictions from the model.
+ preds (tuple): The predictions from the model.
  img (torch.Tensor): The processed image tensor.
  orig_imgs (List[np.ndarray]): The original images before processing.

  Returns:
- results (list): The post-processed predictions.
+ (list): The post-processed predictions.

  Note:
  If `non_overlap_masks` is True, the method applies constraints to ensure non-overlapping masks.
@@ -982,7 +994,7 @@ class SAM2VideoPredictor(SAM2Predictor):
  frame_idx=0,
  ):
  """
- Adds new points or masks to a specific frame for a given object ID.
+ Add new points or masks to a specific frame for a given object ID.

  This method updates the inference state with new prompts (points or masks) for a specified
  object and frame index. It ensures that the prompts are either points or masks, but not both,
@@ -991,13 +1003,14 @@ class SAM2VideoPredictor(SAM2Predictor):
  Args:
  obj_id (int): The ID of the object to which the prompts are associated.
- points (torch.Tensor, Optional): The coordinates of the points of interest. Defaults to None.
- labels (torch.Tensor, Optional): The labels corresponding to the points. Defaults to None.
- masks (torch.Tensor, optional): Binary masks for the object. Defaults to None.
- frame_idx (int, optional): The index of the frame to which the prompts are applied. Defaults to 0.
+ points (torch.Tensor, optional): The coordinates of the points of interest.
+ labels (torch.Tensor, optional): The labels corresponding to the points.
+ masks (torch.Tensor, optional): Binary masks for the object.
+ frame_idx (int, optional): The index of the frame to which the prompts are applied.

  Returns:
- (tuple): A tuple containing the flattened predicted masks and a tensor of ones indicating the number of objects.
+ pred_masks (torch.Tensor): The flattened predicted masks.
+ pred_scores (torch.Tensor): A tensor of ones indicating the number of objects.

  Raises:
  AssertionError: If both `masks` and `points` are provided, or neither is provided.
@@ -1194,16 +1207,16 @@ class SAM2VideoPredictor(SAM2Predictor):

  def get_im_features(self, im, batch=1):
  """
- Extracts and processes image features using SAM2's image encoder for subsequent segmentation tasks.
+ Extract and process image features using SAM2's image encoder for subsequent segmentation tasks.

  Args:
  im (torch.Tensor): The input image tensor.
- batch (int, optional): The batch size for expanding features if there are multiple prompts. Defaults to 1.
+ batch (int, optional): The batch size for expanding features if there are multiple prompts.

  Returns:
  vis_feats (torch.Tensor): The visual features extracted from the image.
  vis_pos_embed (torch.Tensor): The positional embeddings for the visual features.
- feat_sizes (List(Tuple[int])): A list containing the sizes of the extracted features.
+ feat_sizes (List[tuple]): A list containing the sizes of the extracted features.

  Note:
  - If `batch` is greater than 1, the features are expanded to fit the batch size.
@@ -1227,7 +1240,7 @@ class SAM2VideoPredictor(SAM2Predictor):
  obj_id (int): The unique identifier of the object provided by the client side.

  Returns:
- obj_idx (int): The index of the object on the model side.
+ (int): The index of the object on the model side.

  Raises:
  RuntimeError: If an attempt is made to add a new object after tracking has started.
@@ -1291,14 +1304,14 @@ class SAM2VideoPredictor(SAM2Predictor):
  frame_idx (int): The index of the current frame.
  batch_size (int): The batch size for processing the frame.
  is_init_cond_frame (bool): Indicates if the current frame is an initial conditioning frame.
- point_inputs (dict, Optional): Input points and their labels. Defaults to None.
- mask_inputs (torch.Tensor, Optional): Input binary masks. Defaults to None.
+ point_inputs (dict | None): Input points and their labels.
+ mask_inputs (torch.Tensor | None): Input binary masks.
  reverse (bool): Indicates if the tracking should be performed in reverse order.
  run_mem_encoder (bool): Indicates if the memory encoder should be executed.
- prev_sam_mask_logits (torch.Tensor, Optional): Previous mask logits for the current object. Defaults to None.
+ prev_sam_mask_logits (torch.Tensor | None): Previous mask logits for the current object.

  Returns:
- current_out (dict): A dictionary containing the output of the tracking step, including updated features and predictions.
+ (dict): A dictionary containing the output of the tracking step, including updated features and predictions.

  Raises:
  AssertionError: If both `point_inputs` and `mask_inputs` are provided, or neither is provided.
@@ -1348,7 +1361,7 @@ class SAM2VideoPredictor(SAM2Predictor):

  def _get_maskmem_pos_enc(self, out_maskmem_pos_enc):
  """
- Caches and manages the positional encoding for mask memory across frames and objects.
+ Cache and manage the positional encoding for mask memory across frames and objects.

  This method optimizes storage by caching the positional encoding (`maskmem_pos_enc`) for
  mask memory, which is constant across frames and objects, thus reducing the amount of
@@ -1358,11 +1371,11 @@ class SAM2VideoPredictor(SAM2Predictor):
  the current batch size.

  Args:
- out_maskmem_pos_enc (List[torch.Tensor] or None): The positional encoding for mask memory.
+ out_maskmem_pos_enc (List[torch.Tensor] | None): The positional encoding for mask memory.
  Should be a list of tensors or None.

  Returns:
- out_maskmem_pos_enc (List[torch.Tensor]): The positional encoding for mask memory, either cached or expanded.
+ (List[torch.Tensor]): The positional encoding for mask memory, either cached or expanded.

  Note:
  - The method assumes that `out_maskmem_pos_enc` is a list of tensors or None.
@@ -1393,7 +1406,7 @@ class SAM2VideoPredictor(SAM2Predictor):
  run_mem_encoder=False,
  ):
  """
- Consolidates per-object temporary outputs into a single output for all objects.
+ Consolidate per-object temporary outputs into a single output for all objects.

  This method combines the temporary outputs for each object on a given frame into a unified
  output. It fills in any missing objects either from the main output dictionary or leaves
@@ -1402,13 +1415,12 @@ class SAM2VideoPredictor(SAM2Predictor):

  Args:
  frame_idx (int): The index of the frame for which to consolidate outputs.
- is_cond (bool, Optional): Indicates if the frame is considered a conditioning frame.
- Defaults to False.
- run_mem_encoder (bool, Optional): Specifies whether to run the memory encoder after
- consolidating the outputs. Defaults to False.
+ is_cond (bool, optional): Indicates if the frame is considered a conditioning frame.
+ run_mem_encoder (bool, optional): Specifies whether to run the memory encoder after
+ consolidating the outputs.

  Returns:
- consolidated_out (dict): A consolidated output dictionary containing the combined results for all objects.
+ (dict): A consolidated output dictionary containing the combined results for all objects.

  Note:
  - The method initializes the consolidated output with placeholder values for missing objects.
@@ -1538,7 +1550,8 @@ class SAM2VideoPredictor(SAM2Predictor):
  is_mask_from_pts (bool): Indicates if the mask is derived from point interactions.

  Returns:
- (tuple[torch.Tensor, torch.Tensor]): A tuple containing the encoded mask features and positional encoding.
+ maskmem_features (torch.Tensor): The encoded mask features.
+ maskmem_pos_enc (torch.Tensor): The positional encoding.
  """
  # Retrieve correct image features
  current_vision_feats, _, feat_sizes = self.get_im_features(self.inference_state["im"], batch_size)
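The hunks above are documentation-only: summary lines move to imperative mood, Returns sections name their tuple elements (pred_masks, pred_scores, pred_logits), Raises blocks are reordered after Examples, and annotations are made consistent (for example, Dict | None becomes dict | None and "Optional" becomes "optional"). As a companion to those docstrings, here is a minimal sketch of the Predictor workflow they describe, following the overrides pattern from the Ultralytics docs; the model file and image path are placeholders:

    from ultralytics.models.sam import Predictor as SAMPredictor

    # cfg defaults plus an overrides dict, matching the Args sections shown above
    predictor = SAMPredictor(overrides=dict(task="segment", mode="predict", imgsz=1024, model="mobile_sam.pt"))

    predictor.set_image("path/to/image.jpg")  # caches features via get_im_features
    results = predictor(bboxes=[[100, 100, 200, 200]])  # dispatched through prompt_inference
    results = predictor(points=[[900, 370]], labels=[1])  # labels: 1 = foreground, 0 = background
    predictor.reset_image()  # clear the cached image and features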