dgenerate-ultralytics-headless 8.3.179__tar.gz → 8.3.181__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/PKG-INFO +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/dgenerate_ultralytics_headless.egg-info/PKG-INFO +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/__init__.py +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/VOC.yaml +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/VisDrone.yaml +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/loaders.py +4 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/utils.py +1 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/blocks.py +7 -4
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/decoders.py +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/encoders.py +8 -8
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/sam.py +5 -8
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/utils.py +1 -1
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/predict.py +156 -95
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/detect/val.py +17 -9
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/model.py +5 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/obb/val.py +9 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/pose/val.py +13 -6
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/segment/val.py +12 -7
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/yoloe/predict.py +3 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/block.py +1 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/streamlit_inference.py +10 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/downloads.py +5 -3
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/README.md +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/dgenerate_ultralytics_headless.egg-info/SOURCES.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/dgenerate_ultralytics_headless.egg-info/dependency_links.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/dgenerate_ultralytics_headless.egg-info/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/dgenerate_ultralytics_headless.egg-info/requires.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/dgenerate_ultralytics_headless.egg-info/top_level.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/pyproject.toml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/setup.cfg +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/conftest.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_cli.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_cuda.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_engine.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_exports.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_integrations.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_python.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/tests/test_solutions.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/assets/bus.jpg +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/assets/zidane.jpg +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/Argoverse.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/DOTAv1.5.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/DOTAv1.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/GlobalWheat2020.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/HomeObjects-3K.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/ImageNet.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/Objects365.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/SKU-110K.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/african-wildlife.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/brain-tumor.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/carparts-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco128-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco128.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco8-grayscale.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco8-multispectral.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco8-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco8-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/coco8.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/crack-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/dog-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/dota8-multispectral.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/dota8.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/hand-keypoints.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/lvis.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/medical-pills.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/open-images-v7.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/package-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/signature.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/tiger-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/datasets/xView.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/default.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yolo11-cls-resnet18.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yolo11-cls.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yolo11-obb.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yolo11-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yolo11-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yolo11.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yoloe-11-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/11/yoloe-11.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/12/yolo12-cls.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/12/yolo12-obb.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/12/yolo12-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/12/yolo12-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/12/yolo12.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v10/yolov10b.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v10/yolov10l.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v10/yolov10m.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v10/yolov10n.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v10/yolov10s.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v10/yolov10x.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v3/yolov3-spp.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v3/yolov3-tiny.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v3/yolov3.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v5/yolov5-p6.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v5/yolov5.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v6/yolov6.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yoloe-v8.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-cls.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-ghost.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-obb.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-p2.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-p6.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-pose.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-world.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8-worldv2.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v8/yolov8.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9c-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9c.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9e-seg.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9e.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9m.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9s.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/models/v9/yolov9t.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/trackers/botsort.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/cfg/trackers/bytetrack.yaml +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/annotator.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/augment.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/base.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/build.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/converter.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/dataset.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/scripts/download_weights.sh +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/scripts/get_coco.sh +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/scripts/get_coco128.sh +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/scripts/get_imagenet.sh +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/split.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/data/split_dota.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/exporter.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/model.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/predictor.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/results.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/trainer.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/tuner.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/engine/validator.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/hub/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/hub/auth.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/hub/google/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/hub/session.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/hub/utils.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/fastsam/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/fastsam/model.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/fastsam/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/fastsam/utils.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/fastsam/val.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/nas/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/nas/model.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/nas/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/nas/val.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/rtdetr/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/rtdetr/model.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/rtdetr/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/rtdetr/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/rtdetr/val.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/amg.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/build.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/model.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/memory_attention.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/tiny_encoder.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/sam/modules/transformer.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/utils/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/utils/loss.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/utils/ops.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/classify/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/classify/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/classify/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/classify/val.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/detect/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/detect/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/detect/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/obb/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/obb/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/obb/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/pose/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/pose/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/pose/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/segment/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/segment/predict.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/segment/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/world/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/world/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/world/train_world.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/yoloe/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/yoloe/train.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/yoloe/train_seg.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/models/yolo/yoloe/val.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/autobackend.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/activation.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/conv.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/head.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/transformer.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/modules/utils.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/tasks.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/nn/text_model.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/ai_gym.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/analytics.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/config.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/distance_calculation.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/heatmap.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/instance_segmentation.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/object_blurrer.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/object_counter.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/object_cropper.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/parking_management.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/queue_management.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/region_counter.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/security_alarm.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/similarity_search.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/solutions.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/speed_estimation.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/templates/similarity-search.html +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/trackzone.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/solutions/vision_eye.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/basetrack.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/bot_sort.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/byte_tracker.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/track.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/utils/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/utils/gmc.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/utils/kalman_filter.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/trackers/utils/matching.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/autobatch.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/autodevice.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/benchmarks.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/__init__.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/base.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/clearml.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/comet.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/dvc.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/hub.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/mlflow.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/neptune.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/raytune.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/tensorboard.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/callbacks/wb.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/checks.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/dist.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/errors.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/export.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/files.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/instance.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/loss.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/metrics.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/ops.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/patches.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/plotting.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/tal.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/torch_utils.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/triton.py +0 -0
- {dgenerate_ultralytics_headless-8.3.179 → dgenerate_ultralytics_headless-8.3.181}/ultralytics/utils/tuner.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dgenerate-ultralytics-headless
-Version: 8.3.179
+Version: 8.3.181
 Summary: Automatically built Ultralytics package with python-opencv-headless dependency instead of python-opencv
 Author-email: Glenn Jocher <glenn.jocher@ultralytics.com>, Jing Qiu <jing.qiu@ultralytics.com>
 Maintainer-email: Ultralytics <hello@ultralytics.com>
dgenerate_ultralytics_headless.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dgenerate-ultralytics-headless
-Version: 8.3.179
+Version: 8.3.181
 Summary: Automatically built Ultralytics package with python-opencv-headless dependency instead of python-opencv
 Author-email: Glenn Jocher <glenn.jocher@ultralytics.com>, Jing Qiu <jing.qiu@ultralytics.com>
 Maintainer-email: Ultralytics <hello@ultralytics.com>
ultralytics/cfg/datasets/VOC.yaml

@@ -87,7 +87,7 @@ download: |
     f"{url}VOCtest_06-Nov-2007.zip",  # 438MB, 4953 images
     f"{url}VOCtrainval_11-May-2012.zip",  # 1.95GB, 17126 images
   ]
-  download(urls, dir=dir / "images", threads=3)
+  download(urls, dir=dir / "images", threads=3, exist_ok=True)  # download and unzip over existing (required)
 
   # Convert
   path = dir / "images/VOCdevkit"
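The `exist_ok=True` flag shown above is a parameter of `ultralytics.utils.downloads.download`, which fetches each archive and then unzips it; with the flag set, a re-run unzips over a partially extracted directory instead of skipping it. A hedged standalone sketch of the same call (the asset URL base is assumed from the GitHub-assets pattern visible in the VisDrone hunk below):

```python
# Sketch: re-running the VOC download now extracts over an existing directory,
# because exist_ok=True is forwarded to the unzip step.
from pathlib import Path

from ultralytics.utils.downloads import download

url = "https://github.com/ultralytics/assets/releases/download/v0.0.0/"  # assumed mirror
urls = [f"{url}VOCtest_06-Nov-2007.zip"]  # 438MB, 4953 images
download(urls, dir=Path("datasets/VOC/images"), threads=3, exist_ok=True)
```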
ultralytics/cfg/datasets/VisDrone.yaml

@@ -78,7 +78,7 @@ download: |
     "https://github.com/ultralytics/assets/releases/download/v0.0.0/VisDrone2019-DET-test-dev.zip",
     # "https://github.com/ultralytics/assets/releases/download/v0.0.0/VisDrone2019-DET-test-challenge.zip",
   ]
-  download(urls, dir=dir, curl=True, threads=4)
+  download(urls, dir=dir, threads=4)
 
   # Convert
   splits = {"VisDrone2019-DET-train": "train", "VisDrone2019-DET-val": "val", "VisDrone2019-DET-test-dev": "test"}
ultralytics/data/loaders.py

@@ -355,9 +355,10 @@ class LoadImagesAndVideos:
             channels (int): Number of image channels (1 for grayscale, 3 for RGB).
         """
        parent = None
-        if isinstance(path, str) and Path(path).suffix == ".txt":  # *.txt file with img/vid/dir on each line
-            parent = Path(path).parent
-            path = Path(path).read_text().splitlines()  # list of sources
+        if isinstance(path, str) and Path(path).suffix in {".txt", ".csv"}:  # txt/csv file with source paths
+            parent, content = Path(path).parent, Path(path).read_text()
+            path = content.splitlines() if Path(path).suffix == ".txt" else content.split(",")  # list of sources
+            path = [p.strip() for p in path]
         files = []
         for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
             a = str(Path(p).absolute())  # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
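In practice this extends list-file sources from `.txt` (one path per line) to `.csv` (comma-separated), with whitespace stripped around each entry. A hedged usage sketch, with hypothetical file paths:

```python
# Sketch: a .csv (or .txt) file listing sources can be passed directly as a
# predict source; .csv content is split on commas and each entry is stripped.
from pathlib import Path

from ultralytics import YOLO

Path("sources.csv").write_text("im1.jpg, im2.jpg, clip.mp4")  # hypothetical paths

model = YOLO("yolo11n.pt")
for result in model.predict("sources.csv", stream=True):
    print(result.path)
```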
ultralytics/data/utils.py

@@ -219,9 +219,7 @@ def verify_image_label(args: Tuple) -> List:
            assert lb.min() >= -0.01, f"negative class labels {lb[lb < -0.01]}"
 
            # All labels
-            if single_cls:
-                lb[:, 0] = 0
-            max_cls = lb[:, 0].max()  # max label count
+            max_cls = 0 if single_cls else lb[:, 0].max()  # max label count
            assert max_cls < num_cls, (
                f"Label class {int(max_cls)} exceeds dataset class count {num_cls}. "
                f"Possible class labels are 0-{num_cls - 1}"
ultralytics/models/sam/modules/blocks.py

@@ -3,7 +3,7 @@
 import copy
 import math
 from functools import partial
-from typing import Any, Optional, Tuple, Type
+from typing import Optional, Tuple, Type, Union
 
 import numpy as np
 import torch

@@ -856,8 +856,11 @@ class PositionEmbeddingRandom(nn.Module):
     def forward(self, size: Tuple[int, int]) -> torch.Tensor:
         """Generate positional encoding for a grid using random spatial frequencies."""
         h, w = size
-        device: Any = self.positional_encoding_gaussian_matrix.device
-        grid = torch.ones((h, w), device=device, dtype=torch.float32)
+        grid = torch.ones(
+            (h, w),
+            device=self.positional_encoding_gaussian_matrix.device,
+            dtype=self.positional_encoding_gaussian_matrix.dtype,
+        )
         y_embed = grid.cumsum(dim=0) - 0.5
         x_embed = grid.cumsum(dim=1) - 0.5
         y_embed = y_embed / h
@@ -871,7 +874,7 @@ class PositionEmbeddingRandom(nn.Module):
         coords = coords_input.clone()
         coords[:, :, 0] = coords[:, :, 0] / image_size[1]
         coords[:, :, 1] = coords[:, :, 1] / image_size[0]
-        return self._pe_encoding(coords.to(torch.float))  # B x N x C
+        return self._pe_encoding(coords)  # B x N x C
 
 
 class Block(nn.Module):
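The blocks.py hunks above share the pattern that recurs throughout this release's SAM changes: new tensors are created with the dtype and device of an existing parameter or buffer instead of a hard-coded `float32`, so the module keeps working after being cast to half precision. An isolated sketch of the pattern (illustrative only, not the library's code):

```python
# Sketch of the dtype-propagation pattern: derive dtype/device from an existing
# buffer so new tensors follow the module's precision instead of float32.
import torch
import torch.nn as nn

class GridEncoding(nn.Module):
    def __init__(self, num_feats: int = 64):
        super().__init__()
        self.register_buffer("gaussian_matrix", torch.randn(2, num_feats))

    def forward(self, h: int, w: int) -> torch.Tensor:
        m = self.gaussian_matrix
        return torch.ones((h, w), device=m.device, dtype=m.dtype)  # matches module precision

enc = GridEncoding().half()  # .half() also casts floating-point buffers
print(enc(4, 4).dtype)  # torch.float16
```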
ultralytics/models/sam/modules/decoders.py

@@ -423,7 +423,7 @@ class SAM2MaskDecoder(nn.Module):
 
         # Upscale mask embeddings and predict masks using the mask tokens
         src = src.transpose(1, 2).view(b, c, h, w)
-        if not self.use_high_res_features:
+        if not self.use_high_res_features or high_res_features is None:
             upscaled_embedding = self.output_upscaling(src)
         else:
             dc1, ln1, act1, dc2, act2 = self.output_upscaling
ultralytics/models/sam/modules/encoders.py

@@ -258,8 +258,8 @@ class PromptEncoder(nn.Module):
         """Embed point prompts by applying positional encoding and label-specific embeddings."""
         points = points + 0.5  # Shift to center of pixel
         if pad:
-            padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
-            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+            padding_point = torch.zeros((points.shape[0], 1, 2), dtype=points.dtype, device=points.device)
+            padding_label = -torch.ones((labels.shape[0], 1), dtype=labels.dtype, device=labels.device)
             points = torch.cat([points, padding_point], dim=1)
             labels = torch.cat([labels, padding_label], dim=1)
         point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)

@@ -300,10 +300,6 @@ class PromptEncoder(nn.Module):
         else:
             return 1
 
-    def _get_device(self) -> torch.device:
-        """Return the device of the first point embedding's weight tensor."""
-        return self.point_embeddings[0].weight.device
-
     def forward(
         self,
         points: Optional[Tuple[torch.Tensor, torch.Tensor]],

@@ -334,7 +330,11 @@ class PromptEncoder(nn.Module):
             torch.Size([1, 7, 256]) torch.Size([1, 256, 64, 64])
         """
         bs = self._get_batch_size(points, boxes, masks)
-        sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
+        sparse_embeddings = torch.empty(
+            (bs, 0, self.embed_dim),
+            dtype=self.point_embeddings[0].weight.dtype,
+            device=self.point_embeddings[0].weight.device,
+        )
         if points is not None:
             coords, labels = points
             point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))

@@ -637,7 +637,7 @@ class FpnNeck(nn.Module):
             lateral_features = self.convs[n - i](x)
             if i in self.fpn_top_down_levels and prev_features is not None:
                 top_down_features = F.interpolate(
-                    prev_features.to(dtype=torch.float32),
+                    prev_features.to(dtype=x.dtype),
                     scale_factor=2.0,
                     mode=self.fpn_interp_model,
                     align_corners=(None if self.fpn_interp_model == "nearest" else False),
ultralytics/models/sam/modules/sam.py

@@ -488,7 +488,7 @@ class SAM2Model(torch.nn.Module):
             assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
         else:
             # If no points are provide, pad with an empty point (with label -1)
-            sam_point_coords = torch.zeros(B, 1, 2, device=device)
+            sam_point_coords = torch.zeros(B, 1, 2, device=device, dtype=backbone_features.dtype)
             sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
 
         # b) Handle mask prompts

@@ -533,7 +533,6 @@ class SAM2Model(torch.nn.Module):
 
         # convert masks from possibly bfloat16 (or float16) to float32
         # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
-        low_res_multimasks = low_res_multimasks.float()
         high_res_multimasks = F.interpolate(
             low_res_multimasks,
             size=(self.image_size, self.image_size),

@@ -560,12 +559,11 @@ class SAM2Model(torch.nn.Module):
             if self.soft_no_obj_ptr:
                 lambda_is_obj_appearing = object_score_logits.sigmoid()
             else:
-                lambda_is_obj_appearing = is_obj_appearing.float()
+                lambda_is_obj_appearing = is_obj_appearing.to(obj_ptr.dtype)
 
             if self.fixed_no_obj_ptr:
                 obj_ptr = lambda_is_obj_appearing * obj_ptr
             obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
-
         return (
             low_res_multimasks,
             high_res_multimasks,

@@ -769,7 +767,7 @@ class SAM2Model(torch.nn.Module):
                 if self.add_tpos_enc_to_obj_ptrs:
                     t_diff_max = max_obj_ptrs_in_encoder - 1
                     tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
-                    obj_pos = torch.tensor(pos_list, device=device)
+                    obj_pos = torch.tensor(pos_list, device=device, dtype=current_vision_feats[-1].dtype)
                     obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
                     obj_pos = self.obj_ptr_tpos_proj(obj_pos)
                     obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)

@@ -834,7 +832,7 @@ class SAM2Model(torch.nn.Module):
         # scale the raw mask logits with a temperature before applying sigmoid
         binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
         if binarize and not self.training:
-            mask_for_mem = (pred_masks_high_res > 0).float()
+            mask_for_mem = (pred_masks_high_res > 0).to(pix_feat.dtype)
         else:
             # apply sigmoid on the raw mask logits to turn them into range (0, 1)
             mask_for_mem = torch.sigmoid(pred_masks_high_res)

@@ -927,11 +925,10 @@ class SAM2Model(torch.nn.Module):
     ):
         """Run memory encoder on predicted mask to encode it into a new memory feature for future frames."""
         if run_mem_encoder and self.num_maskmem > 0:
-            high_res_masks_for_mem_enc = high_res_masks
             maskmem_features, maskmem_pos_enc = self._encode_new_memory(
                 current_vision_feats=current_vision_feats,
                 feat_sizes=feat_sizes,
-                pred_masks_high_res=high_res_masks_for_mem_enc,
+                pred_masks_high_res=high_res_masks,
                 object_score_logits=object_score_logits,
                 is_mask_from_pts=(point_inputs is not None),
             )
ultralytics/models/sam/modules/utils.py

@@ -78,7 +78,7 @@ def get_1d_sine_pe(pos_inds: torch.Tensor, dim: int, temperature: float = 10000)
         torch.Size([4, 128])
     """
     pe_dim = dim // 2
-    dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
+    dim_t = torch.arange(pe_dim, dtype=pos_inds.dtype, device=pos_inds.device)
     dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
 
     pos_embed = pos_inds.unsqueeze(-1) / dim_t
ultralytics/models/sam/predict.py

@@ -132,9 +132,9 @@ class Predictor(BasePredictor):
             im = torch.from_numpy(im)
 
         im = im.to(self.device)
-        im = im.half() if self.model.fp16 else im.float()
         if not_tensor:
             im = (im - self.mean) / self.std
+        im = im.half() if self.model.fp16 else im.float()
         return im
 
     def pre_transform(self, im):
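The reorder means normalization now happens before the optional half() cast, so the `(im - mean) / std` arithmetic runs in full precision and the image is converted to the model dtype once, at the end. Restated outside the class as an illustrative sketch (device transfer omitted for brevity):

```python
# Sketch of the new preprocess order: normalize first (float32), cast once at the end.
import torch

def preprocess(im: torch.Tensor, mean: torch.Tensor, std: torch.Tensor, fp16: bool, not_tensor: bool) -> torch.Tensor:
    if not_tensor:
        im = (im - mean) / std  # mean/std arithmetic stays in full precision
    return im.half() if fp16 else im.float()  # single cast to the model dtype

im = torch.randint(0, 256, (1, 3, 8, 8)).float()
mean = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)
print(preprocess(im, mean, std, fp16=True, not_tensor=True).dtype)  # torch.float16
```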
@@ -182,9 +182,8 @@ class Predictor(BasePredictor):
             **kwargs (Any): Additional keyword arguments.
 
         Returns:
-            pred_masks (np.ndarray): The output masks in shape (C, H, W), where C is the number of generated masks.
-            pred_scores (np.ndarray): An array of length C containing quality scores predicted by the model for each mask.
-            pred_logits (np.ndarray): Low-resolution logits of shape (C, H, W) for subsequent inference, where H=W=256.
+            pred_masks (torch.Tensor): The output masks in shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): An array of length C containing quality scores predicted by the model for each mask.
 
         Examples:
             >>> predictor = Predictor()

@@ -219,8 +218,8 @@ class Predictor(BasePredictor):
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
 
         Returns:
-            pred_masks (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
-            pred_scores (np.ndarray): Quality scores predicted by the model for each mask, with length C.
+            pred_masks (torch.Tensor): Output masks with shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): Quality scores predicted by the model for each mask, with length C.
 
         Examples:
             >>> predictor = Predictor()
|
|
230
229
|
"""
|
231
230
|
features = self.get_im_features(im) if self.features is None else self.features
|
232
231
|
|
233
|
-
|
232
|
+
prompts = self._prepare_prompts(im.shape[2:], self.batch[1][0].shape[:2], bboxes, points, labels, masks)
|
233
|
+
return self._inference_features(features, *prompts, multimask_output)
|
234
|
+
|
235
|
+
def _inference_features(
|
236
|
+
self,
|
237
|
+
features,
|
238
|
+
bboxes=None,
|
239
|
+
points=None,
|
240
|
+
labels=None,
|
241
|
+
masks=None,
|
242
|
+
multimask_output=False,
|
243
|
+
):
|
244
|
+
"""
|
245
|
+
Perform inference on image features using the SAM model.
|
246
|
+
|
247
|
+
Args:
|
248
|
+
features (torch.Tensor): Extracted image features with shape (B, C, H, W) from the SAM model image encoder.
|
249
|
+
bboxes (np.ndarray | List[List[float]] | None): Bounding boxes in XYXY format with shape (N, 4).
|
250
|
+
points (np.ndarray | List[List[float]] | None): Object location points with shape (N, 2), in pixels.
|
251
|
+
labels (np.ndarray | List[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
|
252
|
+
masks (List[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
|
253
|
+
multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
|
254
|
+
|
255
|
+
Returns:
|
256
|
+
pred_masks (torch.Tensor): Output masks with shape (C, H, W), where C is the number of generated masks.
|
257
|
+
pred_scores (torch.Tensor): Quality scores for each mask, with length C.
|
258
|
+
"""
|
234
259
|
points = (points, labels) if points is not None else None
|
235
260
|
# Embed prompts
|
236
261
|
sparse_embeddings, dense_embeddings = self.model.prompt_encoder(points=points, boxes=bboxes, masks=masks)
|
@@ -248,12 +273,13 @@ class Predictor(BasePredictor):
|
|
248
273
|
# `d` could be 1 or 3 depends on `multimask_output`.
|
249
274
|
return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
|
250
275
|
|
251
|
-
def _prepare_prompts(self, dst_shape, bboxes=None, points=None, labels=None, masks=None):
|
276
|
+
def _prepare_prompts(self, dst_shape, src_shape, bboxes=None, points=None, labels=None, masks=None):
|
252
277
|
"""
|
253
278
|
Prepare and transform the input prompts for processing based on the destination shape.
|
254
279
|
|
255
280
|
Args:
|
256
|
-
dst_shape (
|
281
|
+
dst_shape (Tuple[int, int]): The target shape (height, width) for the prompts.
|
282
|
+
src_shape (Tuple[int, int]): The source shape (height, width) of the input image.
|
257
283
|
bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
|
258
284
|
points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
|
259
285
|
labels (np.ndarray | List | None): Point prompt labels with shape (N) or (N, num_points). 1 for foreground, 0 for background.
|
@@ -268,11 +294,10 @@ class Predictor(BasePredictor):
|
|
268
294
|
Raises:
|
269
295
|
AssertionError: If the number of points don't match the number of labels, in case labels were passed.
|
270
296
|
"""
|
271
|
-
src_shape = self.batch[1][0].shape[:2]
|
272
297
|
r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1])
|
273
298
|
# Transform input prompts
|
274
299
|
if points is not None:
|
275
|
-
points = torch.as_tensor(points, dtype=
|
300
|
+
points = torch.as_tensor(points, dtype=self.torch_dtype, device=self.device)
|
276
301
|
points = points[None] if points.ndim == 1 else points
|
277
302
|
# Assuming labels are all positive if users don't pass labels.
|
278
303
|
if labels is None:
|
@@ -286,11 +311,11 @@ class Predictor(BasePredictor):
|
|
286
311
|
# (N, 2) --> (N, 1, 2), (N, ) --> (N, 1)
|
287
312
|
points, labels = points[:, None, :], labels[:, None]
|
288
313
|
if bboxes is not None:
|
289
|
-
bboxes = torch.as_tensor(bboxes, dtype=
|
314
|
+
bboxes = torch.as_tensor(bboxes, dtype=self.torch_dtype, device=self.device)
|
290
315
|
bboxes = bboxes[None] if bboxes.ndim == 1 else bboxes
|
291
316
|
bboxes *= r
|
292
317
|
if masks is not None:
|
293
|
-
masks = torch.as_tensor(masks, dtype=
|
318
|
+
masks = torch.as_tensor(masks, dtype=self.torch_dtype, device=self.device).unsqueeze(1)
|
294
319
|
return bboxes, points, labels, masks
|
295
320
|
|
296
321
|
def generate(
|
@@ -424,7 +449,8 @@ class Predictor(BasePredictor):
|
|
424
449
|
if model is None:
|
425
450
|
model = self.get_model()
|
426
451
|
model.eval()
|
427
|
-
|
452
|
+
model = model.to(device)
|
453
|
+
self.model = model.half() if self.args.half else model.float()
|
428
454
|
self.device = device
|
429
455
|
self.mean = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1).to(device)
|
430
456
|
self.std = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1).to(device)
|
@@ -433,8 +459,9 @@ class Predictor(BasePredictor):
|
|
433
459
|
self.model.pt = False
|
434
460
|
self.model.triton = False
|
435
461
|
self.model.stride = 32
|
436
|
-
self.model.fp16 =
|
462
|
+
self.model.fp16 = self.args.half
|
437
463
|
self.done_warmup = True
|
464
|
+
self.torch_dtype = torch.float16 if self.model.fp16 else torch.float32
|
438
465
|
|
439
466
|
def get_model(self):
|
440
467
|
"""Retrieve or build the Segment Anything Model (SAM) for image segmentation tasks."""
|
@@ -543,7 +570,7 @@ class Predictor(BasePredictor):
|
|
543
570
|
- The extracted features are stored in the `self.features` attribute for later use.
|
544
571
|
"""
|
545
572
|
if self.model is None:
|
546
|
-
self.setup_model(
|
573
|
+
self.setup_model()
|
547
574
|
self.setup_source(image)
|
548
575
|
assert len(self.dataset) == 1, "`set_image` only supports setting one image!"
|
549
576
|
for batch in self.dataset:
|
@@ -620,6 +647,53 @@ class Predictor(BasePredictor):
|
|
620
647
|
|
621
648
|
return new_masks[keep].to(device=masks.device, dtype=masks.dtype), keep
|
622
649
|
|
650
|
+
@smart_inference_mode()
|
651
|
+
def inference_features(
|
652
|
+
self,
|
653
|
+
features,
|
654
|
+
src_shape,
|
655
|
+
dst_shape=None,
|
656
|
+
bboxes=None,
|
657
|
+
points=None,
|
658
|
+
labels=None,
|
659
|
+
masks=None,
|
660
|
+
multimask_output=False,
|
661
|
+
):
|
662
|
+
"""
|
663
|
+
Perform prompts preprocessing and inference on provided image features using the SAM model.
|
664
|
+
|
665
|
+
Args:
|
666
|
+
features (torch.Tensor | Dict[str, Any]): Extracted image features from the SAM/SAM2 model image encoder.
|
667
|
+
src_shape (Tuple[int, int]): The source shape (height, width) of the input image.
|
668
|
+
dst_shape (Tuple[int, int] | None): The target shape (height, width) for the prompts. If None, defaults to (imgsz, imgsz).
|
669
|
+
bboxes (np.ndarray | List[List[float]] | None): Bounding boxes in xyxy format with shape (N, 4).
|
670
|
+
points (np.ndarray | List[List[float]] | None): Points indicating object locations with shape (N, 2), in pixels.
|
671
|
+
labels (np.ndarray | List[int] | None): Point prompt labels with shape (N, ).
|
672
|
+
masks (List[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
|
673
|
+
multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
|
674
|
+
|
675
|
+
Returns:
|
676
|
+
pred_masks (torch.Tensor): The output masks in shape (C, H, W), where C is the number of generated masks.
|
677
|
+
pred_bboxes (torch.Tensor): Bounding boxes for each mask with shape (N, 6), where N is the number of boxes.
|
678
|
+
Each box is in xyxy format with additional columns for score and class.
|
679
|
+
|
680
|
+
Notes:
|
681
|
+
- The input features is a torch.Tensor of shape (B, C, H, W) if performing on SAM, or a Dict[str, Any] if performing on SAM2.
|
682
|
+
"""
|
683
|
+
dst_shape = dst_shape or (self.args.imgsz, self.args.imgsz)
|
684
|
+
prompts = self._prepare_prompts(dst_shape, src_shape, bboxes, points, labels, masks)
|
685
|
+
pred_masks, pred_scores = self._inference_features(features, *prompts, multimask_output)
|
686
|
+
if len(pred_masks) == 0:
|
687
|
+
pred_masks, pred_bboxes = None, torch.zeros((0, 6), device=pred_masks.device)
|
688
|
+
else:
|
689
|
+
pred_masks = ops.scale_masks(pred_masks[None].float(), src_shape, padding=False)[0]
|
690
|
+
pred_masks = pred_masks > self.model.mask_threshold # to bool
|
691
|
+
pred_bboxes = batched_mask_to_box(pred_masks)
|
692
|
+
# NOTE: SAM models do not return cls info. This `cls` here is just a placeholder for consistency.
|
693
|
+
cls = torch.arange(len(pred_masks), dtype=torch.int32, device=pred_masks.device)
|
694
|
+
pred_bboxes = torch.cat([pred_bboxes, pred_scores[:, None], cls[:, None]], dim=-1)
|
695
|
+
return pred_masks, pred_bboxes
|
696
|
+
|
 
 class SAM2Predictor(Predictor):
     """
@@ -663,80 +737,13 @@ class SAM2Predictor(Predictor):
 
         return build_sam(self.args.model)
 
-    def inference(
-        self,
-        im,
-        bboxes=None,
-        points=None,
-        labels=None,
-        masks=None,
-        multimask_output=False,
-        img_idx=-1,
-    ):
-        """
-        Perform image segmentation inference based on various prompts using SAM2 architecture.
-
-        This method leverages the Segment Anything Model 2 (SAM2) to generate segmentation masks for input images
-        based on provided prompts such as bounding boxes, points, or existing masks. It supports both single and
-        multi-object prediction scenarios.
-
-        Args:
-            im (torch.Tensor): Preprocessed input image tensor with shape (N, C, H, W).
-            bboxes (np.ndarray | List[List[float]] | None): Bounding boxes in XYXY format with shape (N, 4).
-            points (np.ndarray | List[List[float]] | None): Object location points with shape (N, 2), in pixels.
-            labels (np.ndarray | List[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
-            masks (np.ndarray | None): Low-resolution masks from previous predictions with shape (N, H, W).
-            multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
-            img_idx (int): Index of the image in the batch to process.
-
-        Returns:
-            pred_masks (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
-            pred_scores (np.ndarray): Quality scores for each mask, with length C.
-
-        Examples:
-            >>> predictor = SAM2Predictor(cfg)
-            >>> image = torch.rand(1, 3, 640, 640)
-            >>> bboxes = [[100, 100, 200, 200]]
-            >>> result = predictor(image, bboxes=bboxes)[0]
-            >>> print(f"Generated {result.masks.shape[0]} masks with average score {result.boxes.conf.mean():.2f}")
-
-        Notes:
-            - The method supports batched inference for multiple objects when points or bboxes are provided.
-            - Input prompts (bboxes, points) are automatically scaled to match the input image dimensions.
-            - When both bboxes and points are provided, they are merged into a single 'points' input for the model.
-        """
-        features = self.get_im_features(im) if self.features is None else self.features
-
-        points, labels, masks = self._prepare_prompts(im.shape[2:], bboxes, points, labels, masks)
-        points = (points, labels) if points is not None else None
-
-        sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder(
-            points=points,
-            boxes=None,
-            masks=masks,
-        )
-        # Predict masks
-        batched_mode = points is not None and points[0].shape[0] > 1  # multi object prediction
-        high_res_features = [feat_level[img_idx].unsqueeze(0) for feat_level in features["high_res_feats"]]
-        pred_masks, pred_scores, _, _ = self.model.sam_mask_decoder(
-            image_embeddings=features["image_embed"][img_idx].unsqueeze(0),
-            image_pe=self.model.sam_prompt_encoder.get_dense_pe(),
-            sparse_prompt_embeddings=sparse_embeddings,
-            dense_prompt_embeddings=dense_embeddings,
-            multimask_output=multimask_output,
-            repeat_image=batched_mode,
-            high_res_features=high_res_features,
-        )
-        # (N, d, H, W) --> (N*d, H, W), (N, d) --> (N*d, )
-        # `d` could be 1 or 3 depends on `multimask_output`.
-        return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
-
-    def _prepare_prompts(self, dst_shape, bboxes=None, points=None, labels=None, masks=None):
+    def _prepare_prompts(self, dst_shape, src_shape, bboxes=None, points=None, labels=None, masks=None):
         """
         Prepare and transform the input prompts for processing based on the destination shape.
 
         Args:
-            dst_shape (tuple): The target shape (height, width) for the prompts.
+            dst_shape (Tuple[int, int]): The target shape (height, width) for the prompts.
+            src_shape (Tuple[int, int]): The source shape (height, width) of the input image.
             bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
             points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
             labels (np.ndarray | List | None): Point prompt labels with shape (N,) or (N, num_points). 1 for foreground, 0 for background.
@@ -750,7 +757,7 @@ class SAM2Predictor(Predictor):
         Raises:
             AssertionError: If the number of points don't match the number of labels, in case labels were passed.
         """
-        bboxes, points, labels, masks = super()._prepare_prompts(dst_shape, bboxes, points, labels, masks)
+        bboxes, points, labels, masks = super()._prepare_prompts(dst_shape, src_shape, bboxes, points, labels, masks)
         if bboxes is not None:
             bboxes = bboxes.view(-1, 2, 2)
             bbox_labels = torch.tensor([[2, 3]], dtype=torch.int32, device=bboxes.device).expand(len(bboxes), -1)
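`_prepare_prompts` now receives both shapes so pixel prompts given in source-image coordinates can be rescaled to the model's input resolution. The underlying arithmetic is a per-axis ratio; a minimal sketch of that scaling step, not the package's exact helper:

    import torch

    def scale_points(points, src_shape, dst_shape):
        """Rescale (N, 2) xy pixel points from src (h, w) to dst (h, w)."""
        points = torch.as_tensor(points, dtype=torch.float32).clone()
        points[..., 0] *= dst_shape[1] / src_shape[1]  # x scales by the width ratio
        points[..., 1] *= dst_shape[0] / src_shape[0]  # y scales by the height ratio
        return points

    print(scale_points([[320.0, 240.0]], src_shape=(480, 640), dst_shape=(1024, 1024)))
    # tensor([[512., 512.]])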
@@ -813,6 +820,58 @@ class SAM2Predictor(Predictor):
         ][::-1]
         return {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
 
+    def _inference_features(
+        self,
+        features,
+        points=None,
+        labels=None,
+        masks=None,
+        multimask_output=False,
+        img_idx=-1,
+    ):
+        """
+        Perform inference on image features using the SAM2 model.
+
+        Args:
+            features (torch.Tensor | Dict[str, Any]): Extracted image features with shape (B, C, H, W) from the SAM2 model image encoder, it
+                could also be a dictionary including:
+                - image_embed (torch.Tensor): Image embedding with shape (B, C, H, W).
+                - high_res_feats (List[torch.Tensor]): List of high-resolution feature maps from the backbone, each with shape (B, C, H, W).
+            points (np.ndarray | List[List[float]] | None): Object location points with shape (N, 2), in pixels.
+            labels (np.ndarray | List[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
+            masks (List[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
+            multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
+            img_idx (int): Index of the image in the batch to process.
+
+        Returns:
+            pred_masks (torch.Tensor): Output masks with shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): Quality scores for each mask, with length C.
+        """
+        points = (points, labels) if points is not None else None
+        sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder(
+            points=points,
+            boxes=None,
+            masks=masks,
+        )
+        # Predict masks
+        batched_mode = points is not None and points[0].shape[0] > 1  # multi object prediction
+        high_res_features = None
+        if isinstance(features, dict):
+            high_res_features = [feat_level[img_idx].unsqueeze(0) for feat_level in features["high_res_feats"]]
+            features = features["image_embed"][[img_idx]]
+        pred_masks, pred_scores, _, _ = self.model.sam_mask_decoder(
+            image_embeddings=features,
+            image_pe=self.model.sam_prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+            repeat_image=batched_mode,
+            high_res_features=high_res_features,
+        )
+        # (N, d, H, W) --> (N*d, H, W), (N, d) --> (N*d, )
+        # `d` could be 1 or 3 depends on `multimask_output`.
+        return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
+
 
 class SAM2VideoPredictor(SAM2Predictor):
     """
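SAM2's `_inference_features` accepts either a raw embedding tensor or the dict produced by `get_im_features`. The dict-vs-tensor dispatch from the hunk above, replayed in isolation with toy shapes and no real model, to show what each branch yields:

    import torch

    def split_features(features, img_idx=-1):
        """Mirror the hunk's dict-vs-tensor handling with toy data."""
        high_res_features = None
        if isinstance(features, dict):  # SAM2-style encoder output
            high_res_features = [f[img_idx].unsqueeze(0) for f in features["high_res_feats"]]
            features = features["image_embed"][[img_idx]]  # keep the batch dim
        return features, high_res_features

    sam2_out = {
        "image_embed": torch.rand(2, 256, 64, 64),
        "high_res_feats": [torch.rand(2, 32, 256, 256), torch.rand(2, 64, 128, 128)],
    }
    embed, high_res = split_features(sam2_out)
    print(embed.shape, [f.shape for f in high_res])
    # torch.Size([1, 256, 64, 64]) [torch.Size([1, 32, 256, 256]), torch.Size([1, 64, 128, 128])]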
@@ -900,8 +959,8 @@ class SAM2VideoPredictor(SAM2Predictor):
             masks (np.ndarray, optional): Low-resolution masks from previous predictions shape (N,H,W). For SAM H=W=256.
 
         Returns:
-            pred_masks (np.ndarray): The output masks in shape CxHxW, where C is the number of generated masks.
-            pred_scores (np.ndarray): An array of length C containing quality scores predicted by the model for each mask.
+            pred_masks (torch.Tensor): The output masks in shape CxHxW, where C is the number of generated masks.
+            pred_scores (torch.Tensor): An array of length C containing quality scores predicted by the model for each mask.
         """
         # Override prompts if any stored in self.prompts
         bboxes = self.prompts.pop("bboxes", bboxes)
@@ -912,7 +971,9 @@ class SAM2VideoPredictor(SAM2Predictor):
         self.inference_state["im"] = im
         output_dict = self.inference_state["output_dict"]
         if len(output_dict["cond_frame_outputs"]) == 0:  # initialize prompts
-            points, labels, masks = self._prepare_prompts(im.shape[2:], bboxes, points, labels, masks)
+            points, labels, masks = self._prepare_prompts(
+                im.shape[2:], self.batch[1][0].shape[:2], bboxes, points, labels, masks
+            )
             if points is not None:
                 for i in range(len(points)):
                     self.add_new_prompts(obj_id=i, points=points[[i]], labels=labels[[i]], frame_idx=frame)
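In the video path the source shape is now taken from the original frame (`self.batch[1][0]` holds the first unprocessed image in ultralytics' predictor batch), while `im.shape[2:]` is the resized model input. A toy illustration of the two shapes a prompt must bridge; all concrete values are assumptions:

    import numpy as np

    frame = np.zeros((480, 640, 3), dtype=np.uint8)  # native video frame (h, w, c)
    src_shape = frame.shape[:2]                      # (480, 640): prompts are given in these pixels
    dst_shape = (1024, 1024)                         # model input after resize, i.e. im.shape[2:]
    # A bbox prompt in frame pixels maps by (dst/src) per axis before encoding
    x1, y1, x2, y2 = 100, 100, 200, 200
    scaled = [x1 * dst_shape[1] / src_shape[1], y1 * dst_shape[0] / src_shape[0],
              x2 * dst_shape[1] / src_shape[1], y2 * dst_shape[0] / src_shape[0]]
    print(scaled)  # [160.0, 213.33..., 320.0, 426.66...]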
@@ -966,7 +1027,7 @@ class SAM2VideoPredictor(SAM2Predictor):
         the masks do not overlap, which can be useful for certain applications.
 
         Args:
-            preds (tuple): The predicted masks and scores from the model.
+            preds (Tuple[torch.Tensor, torch.Tensor]): The predicted masks and scores from the model.
             img (torch.Tensor): The processed image tensor.
             orig_imgs (List[np.ndarray]): The original images before processing.
 
@@ -1441,13 +1502,13 @@ class SAM2VideoPredictor(SAM2Predictor):
             "pred_masks": torch.full(
                 size=(batch_size, 1, self.imgsz[0] // 4, self.imgsz[1] // 4),
                 fill_value=-1024.0,
-                dtype=torch.float32,
+                dtype=self.torch_dtype,
                 device=self.device,
             ),
             "obj_ptr": torch.full(
                 size=(batch_size, self.model.hidden_dim),
                 fill_value=-1024.0,
-                dtype=torch.float32,
+                dtype=self.torch_dtype,
                 device=self.device,
             ),
             "object_score_logits": torch.full(
@@ -1455,7 +1516,7 @@ class SAM2VideoPredictor(SAM2Predictor):
                 # default to 10.0 for object_score_logits, i.e. assuming the object is
                 # present as sigmoid(10)=1, same as in `predict_masks` of `MaskDecoder`
                 fill_value=10.0,
-                dtype=torch.float32,
+                dtype=self.torch_dtype,
                 device=self.device,
             ),
         }
@@ -1527,7 +1588,7 @@ class SAM2VideoPredictor(SAM2Predictor):
             feat_sizes=feat_sizes,
             point_inputs=None,
             # A dummy (empty) mask with a single object
-            mask_inputs=torch.zeros((1, 1, *self.imgsz), dtype=torch.float32, device=self.device),
+            mask_inputs=torch.zeros((1, 1, *self.imgsz), dtype=self.torch_dtype, device=self.device),
             output_dict={},
             num_frames=self.inference_state["num_frames"],
             track_in_reverse=False,