dgenerate-ultralytics-headless 8.3.235-py3-none-any.whl → 8.3.237-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/RECORD +41 -28
- tests/test_exports.py +15 -1
- ultralytics/__init__.py +1 -1
- ultralytics/engine/exporter.py +113 -12
- ultralytics/engine/predictor.py +3 -2
- ultralytics/engine/trainer.py +8 -0
- ultralytics/models/rtdetr/val.py +5 -1
- ultralytics/models/sam/__init__.py +14 -1
- ultralytics/models/sam/build.py +17 -8
- ultralytics/models/sam/build_sam3.py +374 -0
- ultralytics/models/sam/model.py +12 -4
- ultralytics/models/sam/modules/blocks.py +20 -8
- ultralytics/models/sam/modules/decoders.py +2 -3
- ultralytics/models/sam/modules/encoders.py +4 -1
- ultralytics/models/sam/modules/memory_attention.py +6 -2
- ultralytics/models/sam/modules/sam.py +150 -6
- ultralytics/models/sam/modules/utils.py +134 -4
- ultralytics/models/sam/predict.py +2076 -118
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +535 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +198 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +357 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/tokenizer_ve.py +242 -0
- ultralytics/models/sam/sam3/vitdet.py +546 -0
- ultralytics/models/sam/sam3/vl_combiner.py +165 -0
- ultralytics/models/yolo/obb/val.py +18 -7
- ultralytics/nn/autobackend.py +35 -0
- ultralytics/nn/modules/transformer.py +21 -1
- ultralytics/utils/checks.py +41 -0
- ultralytics/utils/ops.py +1 -3
- ultralytics/utils/torch_utils.py +1 -0
- {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/sam.py:

```diff
@@ -13,7 +13,7 @@ from torch.nn.init import trunc_normal_
 from ultralytics.nn.modules import MLP
 from ultralytics.utils import LOGGER
 
-from .blocks import SAM2TwoWayTransformer
+from .blocks import SAM2TwoWayTransformer, TwoWayTransformer
 from .decoders import MaskDecoder, SAM2MaskDecoder
 from .encoders import ImageEncoderViT, PromptEncoder
 from .utils import get_1d_sine_pe, select_closest_cond_frames
```
```diff
@@ -329,6 +329,7 @@ class SAM2Model(torch.nn.Module):
 
         self._build_sam_heads()
         self.max_cond_frames_in_attn = max_cond_frames_in_attn
+        self.add_all_frames_to_correct_as_cond = True
 
         # Model compilation
         if compile_image_encoder:
```
```diff
@@ -473,7 +474,7 @@ class SAM2Model(torch.nn.Module):
             assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
             if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
                 sam_mask_prompt = F.interpolate(
-                    mask_inputs.float(),
+                    mask_inputs.to(backbone_features.dtype),
                     size=self.sam_prompt_encoder.mask_input_size,
                     align_corners=False,
                     mode="bilinear",
```
```diff
@@ -571,7 +572,7 @@ class SAM2Model(torch.nn.Module):
         # produce an object pointer using the SAM decoder from the mask input
         _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
             backbone_features=backbone_features,
-            mask_inputs=self.mask_downsample(mask_inputs_float),
+            mask_inputs=self.mask_downsample(mask_inputs_float.to(backbone_features.dtype)),
             high_res_features=high_res_features,
         )
         # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
```
```diff
@@ -818,7 +819,6 @@ class SAM2Model(torch.nn.Module):
             mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
         maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)  # sigmoid already applied
         maskmem_features = maskmem_out["vision_features"]
-        maskmem_pos_enc = maskmem_out["vision_pos_enc"]
         # add a no-object embedding to the spatial memory to indicate that the frame
         # is predicted to be occluded (i.e. no object is appearing in the frame)
         if self.no_obj_embed_spatial is not None:
```
```diff
@@ -827,7 +827,7 @@ class SAM2Model(torch.nn.Module):
                 ..., None, None
             ].expand(*maskmem_features.shape)
 
-        return maskmem_features, maskmem_pos_enc
+        return maskmem_features, maskmem_out["vision_pos_enc"]
 
     def _track_step(
         self,
```
```diff
@@ -1005,7 +1005,151 @@ class SAM2Model(torch.nn.Module):
 
     def set_imgsz(self, imgsz):
         """Set image size to make model compatible with different image sizes."""
+        if hasattr(self.image_encoder, "set_imgsz"):
+            self.image_encoder.set_imgsz(imgsz)
         self.image_size = imgsz[0]
         self.sam_prompt_encoder.input_image_size = imgsz
-        self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
+        self.sam_prompt_encoder.image_embedding_size = [
+            x // self.backbone_stride for x in imgsz
+        ]  # fixed ViT patch size of 16
+        self.sam_prompt_encoder.mask_input_size = [
+            x // self.backbone_stride * 4 for x in imgsz
+        ]  # fixed ViT patch size of 16
         self.sam_image_embedding_size = self.image_size // self.backbone_stride  # update image embedding size
+
+
+class SAM3Model(SAM2Model):
+    """SAM3Model class for Segment Anything Model 3 with memory-based video object segmentation capabilities."""
+
+    def __init__(
+        self,
+        image_encoder,
+        memory_attention,
+        memory_encoder,
+        num_maskmem=7,
+        image_size=1008,
+        backbone_stride=14,
+        sigmoid_scale_for_mem_enc=1,
+        sigmoid_bias_for_mem_enc=0,
+        binarize_mask_from_pts_for_mem_enc=False,
+        use_mask_input_as_output_without_sam=False,
+        max_cond_frames_in_attn=-1,
+        directly_add_no_mem_embed=False,
+        use_high_res_features_in_sam=False,
+        multimask_output_in_sam=False,
+        multimask_min_pt_num=1,
+        multimask_max_pt_num=1,
+        multimask_output_for_tracking=False,
+        use_multimask_token_for_obj_ptr: bool = False,
+        iou_prediction_use_sigmoid=False,
+        memory_temporal_stride_for_eval=1,
+        non_overlap_masks_for_mem_enc=False,
+        use_obj_ptrs_in_encoder=False,
+        max_obj_ptrs_in_encoder=16,
+        add_tpos_enc_to_obj_ptrs=True,
+        proj_tpos_enc_in_obj_ptrs=False,
+        use_signed_tpos_enc_to_obj_ptrs=False,
+        only_obj_ptrs_in_the_past_for_eval=False,
+        pred_obj_scores: bool = False,
+        pred_obj_scores_mlp: bool = False,
+        fixed_no_obj_ptr: bool = False,
+        soft_no_obj_ptr: bool = False,
+        use_mlp_for_obj_ptr_proj: bool = False,
+        no_obj_embed_spatial: bool = False,
+        sam_mask_decoder_extra_args=None,
+        compile_image_encoder: bool = False,
+    ):
+        """SAM3Model class for Segment Anything Model 3 with memory-based video object segmentation capabilities."""
+        super().__init__(
+            image_encoder,
+            memory_attention,
+            memory_encoder,
+            num_maskmem,
+            image_size,
+            backbone_stride,
+            sigmoid_scale_for_mem_enc,
+            sigmoid_bias_for_mem_enc,
+            binarize_mask_from_pts_for_mem_enc,
+            use_mask_input_as_output_without_sam,
+            max_cond_frames_in_attn,
+            directly_add_no_mem_embed,
+            use_high_res_features_in_sam,
+            multimask_output_in_sam,
+            multimask_min_pt_num,
+            multimask_max_pt_num,
+            multimask_output_for_tracking,
+            use_multimask_token_for_obj_ptr,
+            iou_prediction_use_sigmoid,
+            memory_temporal_stride_for_eval,
+            non_overlap_masks_for_mem_enc,
+            use_obj_ptrs_in_encoder,
+            max_obj_ptrs_in_encoder,
+            add_tpos_enc_to_obj_ptrs,
+            proj_tpos_enc_in_obj_ptrs,
+            use_signed_tpos_enc_to_obj_ptrs,
+            only_obj_ptrs_in_the_past_for_eval,
+            pred_obj_scores,
+            pred_obj_scores_mlp,
+            fixed_no_obj_ptr,
+            soft_no_obj_ptr,
+            use_mlp_for_obj_ptr_proj,
+            no_obj_embed_spatial,
+            sam_mask_decoder_extra_args,
+            compile_image_encoder,
+        )
+        self.sam_mask_decoder = SAM2MaskDecoder(
+            num_multimask_outputs=3,
+            transformer=TwoWayTransformer(
+                depth=2,
+                embedding_dim=self.sam_prompt_embed_dim,
+                mlp_dim=2048,
+                num_heads=8,
+            ),
+            transformer_dim=self.sam_prompt_embed_dim,
+            iou_head_depth=3,
+            iou_head_hidden_dim=256,
+            use_high_res_features=self.use_high_res_features_in_sam,
+            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
+            pred_obj_scores=self.pred_obj_scores,
+            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
+            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
+            **(self.sam_mask_decoder_extra_args or {}),
+        )
+
+    def forward_image(self, img_batch: torch.Tensor):
+        """Process image batch through encoder to extract multi-level features for SAM model."""
+        backbone_out = self.image_encoder.forward_image_sam2(img_batch)
+        if self.use_high_res_features_in_sam:
+            # precompute projected level 0 and level 1 features in SAM decoder
+            # to avoid running it again on every SAM click
+            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
+            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
+        return backbone_out
+
+    def set_imgsz(self, imgsz: tuple[int, int]):
+        """Set the image size for the model and mask downsampler."""
+        super().set_imgsz(imgsz)
+        self.memory_encoder.mask_downsampler.interpol_size = [size // 14 * 16 for size in imgsz]
+
+    @staticmethod
+    def _suppress_shrinked_masks(pred_masks, new_pred_masks, shrink_threshold=0.3):
+        """Suppress masks that shrink in area after applying pixelwise non-overlapping constraints."""
+        area_before = (pred_masks > 0).sum(dim=(-1, -2))
+        area_after = (new_pred_masks > 0).sum(dim=(-1, -2))
+        area_before = torch.clamp(area_before, min=1.0)
+        area_ratio = area_after / area_before
+        keep = area_ratio >= shrink_threshold
+        keep_mask = keep[..., None, None].expand_as(pred_masks)
+        pred_masks_after = torch.where(keep_mask, pred_masks, torch.clamp(pred_masks, max=-10.0))
+        return pred_masks_after
+
+    def _suppress_object_pw_area_shrinkage(self, pred_masks):
+        """This function suppresses masks that shrink in area after applying pixelwise non-overlapping constraints. Note
+        that the final output can still be overlapping.
+        """
+        # Apply pixel-wise non-overlapping constraint based on mask scores
+        pixel_level_non_overlapping_masks = self._apply_non_overlapping_constraints(pred_masks)
+        # Fully suppress masks with high shrinkage (probably noisy) based on the pixel wise non-overlapping constraints
+        # NOTE: The output of this function can be a no op if none of the masks shrinked by a large factor.
+        pred_masks = self._suppress_shrinked_masks(pred_masks, pixel_level_non_overlapping_masks)
+        return pred_masks
```
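The last two methods in the hunk above implement a two-step filter: predicted masks are first made pixel-wise non-overlapping, and any mask whose positive area then collapses below 30% of its original size is suppressed outright, while the surviving masks keep their original (possibly still overlapping) logits. The snippet below is a minimal, self-contained sketch of that idea on toy logits; the helper names and the per-pixel argmax rule standing in for `_apply_non_overlapping_constraints` are illustrative, not the ultralytics API.

```python
# Toy illustration of the shrink-suppression step (assumed shape: (N, 1, H, W) mask logits).
import torch


def apply_non_overlapping(pred_masks: torch.Tensor) -> torch.Tensor:
    """Keep, per pixel, only the object with the highest logit; clamp the rest far below zero."""
    max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
    batch_obj_inds = torch.arange(pred_masks.size(0), device=pred_masks.device)[:, None, None, None]
    keep = max_obj_inds == batch_obj_inds
    return torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))


def suppress_shrunk_masks(pred_masks, new_pred_masks, shrink_threshold=0.3):
    """Drop whole masks whose positive area shrank below `shrink_threshold` of the original."""
    area_before = (pred_masks > 0).sum(dim=(-1, -2)).float().clamp(min=1.0)
    area_after = (new_pred_masks > 0).sum(dim=(-1, -2))
    keep = (area_after / area_before) >= shrink_threshold
    keep_mask = keep[..., None, None].expand_as(pred_masks)
    return torch.where(keep_mask, pred_masks, torch.clamp(pred_masks, max=-10.0))


# Three toy objects: a strong blob, a weaker duplicate of it, and a disjoint region.
H = W = 8
yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
blob = ((xx < 6) & (yy < 6)).float()
logits = torch.stack([blob * 4.0 - 2.0, blob * 3.0 - 2.0, (1.0 - blob) * 4.0 - 2.0])[:, None]

non_overlap = apply_non_overlapping(logits)
final = suppress_shrunk_masks(logits, non_overlap)
print((logits > 0).sum(dim=(-1, -2)).flatten().tolist())  # [36, 36, 28] positive pixels per object
print((final > 0).sum(dim=(-1, -2)).flatten().tolist())   # [36, 0, 28]: the duplicate blob is fully dropped
```

Because the suppression is applied to the original logits rather than the non-overlapped ones, the kept masks can still overlap, which matches the note in the `_suppress_object_pw_area_shrinkage` docstring.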
ultralytics/models/sam/modules/utils.py:

```diff
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import math
 from typing import Any
 
 import torch
```
```diff
@@ -86,7 +87,7 @@ def get_1d_sine_pe(pos_inds: torch.Tensor, dim: int, temperature: float = 10000)
     return pos_embed
 
 
-def init_t_xy(end_x: int, end_y: int):
+def init_t_xy(end_x: int, end_y: int, scale: float = 1.0, offset: int = 0):
     """Initialize 1D and 2D coordinate tensors for a grid of specified dimensions.
 
     This function creates coordinate tensors for a grid with dimensions end_x × end_y. It generates a linear index
```
```diff
@@ -95,6 +96,8 @@ def init_t_xy(end_x: int, end_y: int):
     Args:
         end_x (int): Width of the grid (number of columns).
         end_y (int): Height of the grid (number of rows).
+        scale (float): Scaling factor to apply to the coordinates.
+        offset (int): Offset to add to the coordinates.
 
     Returns:
         t_x (torch.Tensor): X-coordinates for each position, with shape (end_x * end_y).
```
```diff
@@ -110,10 +113,10 @@ def init_t_xy(end_x: int, end_y: int):
     t = torch.arange(end_x * end_y, dtype=torch.float32)
     t_x = (t % end_x).float()
     t_y = torch.div(t, end_x, rounding_mode="floor").float()
-    return t_x, t_y
+    return t_x * scale + offset, t_y * scale + offset
 
 
-def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
+def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0, scale_pos: float = 1.0):
     """Compute axial complex exponential positional encodings for 2D spatial positions in a grid.
 
     This function generates complex exponential positional encodings for a 2D grid of spatial positions, using separate
```
```diff
@@ -124,6 +127,7 @@ def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
         end_x (int): Width of the 2D grid.
         end_y (int): Height of the 2D grid.
         theta (float, optional): Scaling factor for frequency computation.
+        scale_pos (float, optional): Scaling factor for position coordinates.
 
     Returns:
         (torch.Tensor): Complex exponential positional encodings with shape (end_x*end_y, dim//2).
```
```diff
@@ -137,7 +141,7 @@ def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
     freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
     freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
 
-    t_x, t_y = init_t_xy(end_x, end_y)
+    t_x, t_y = init_t_xy(end_x, end_y, scale=scale_pos)
     freqs_x = torch.outer(t_x, freqs_x)
     freqs_y = torch.outer(t_y, freqs_y)
     freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
```
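For illustration, here is a small standalone replica of the new init_t_xy behaviour (copied from the hunk above rather than imported from the package): grid coordinates are generated exactly as before and then uniformly rescaled and shifted, which is what compute_axial_cis now forwards through its scale_pos argument.

```python
# Standalone replica of the updated init_t_xy, for illustration only.
import torch


def init_t_xy(end_x: int, end_y: int, scale: float = 1.0, offset: int = 0):
    t = torch.arange(end_x * end_y, dtype=torch.float32)
    t_x = (t % end_x).float()
    t_y = torch.div(t, end_x, rounding_mode="floor").float()
    return t_x * scale + offset, t_y * scale + offset


t_x, _ = init_t_xy(4, 2)                    # default: plain integer grid coordinates
t_x_scaled, _ = init_t_xy(4, 2, scale=0.5)  # same grid sampled at half spacing
print(t_x.tolist())         # [0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0]
print(t_x_scaled.tolist())  # [0.0, 0.5, 1.0, 1.5, 0.0, 0.5, 1.0, 1.5]
```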
```diff
@@ -375,3 +379,129 @@ def add_decomposed_rel_pos(
     )
 
     return attn
+
+
+def get_abs_pos(
+    abs_pos: torch.Tensor,
+    has_cls_token: bool,
+    hw: tuple[int, int],
+    retain_cls_token: bool = False,
+    tiling: bool = False,
+) -> torch.Tensor:
+    """Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
+    original embeddings.
+
+    Args:
+        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
+        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
+        hw (Tuple): size of input image tokens.
+        retain_cls_token: whether to retain the cls_token
+        tiling: whether to tile the embeddings, *instead* of interpolation (a la abs_win)
+
+    Returns:
+        Absolute positional embeddings after processing with shape (1, H, W, C),: if retain_cls_token is False,
+        otherwise (1, 1+H*W, C).
+    """
+    if retain_cls_token:
+        assert has_cls_token
+
+    h, w = hw
+    if has_cls_token:
+        cls_pos = abs_pos[:, :1]
+        abs_pos = abs_pos[:, 1:]
+
+    xy_num = abs_pos.shape[1]
+    size = int(math.sqrt(xy_num))
+    assert size * size == xy_num
+
+    if size != h or size != w:
+        new_abs_pos = abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2)
+        if tiling:
+            new_abs_pos = new_abs_pos.tile([1, 1] + [x // y + 1 for x, y in zip((h, w), new_abs_pos.shape[2:])])[
+                :, :, :h, :w
+            ]
+        else:
+            new_abs_pos = F.interpolate(
+                new_abs_pos,
+                size=(h, w),
+                mode="bicubic",
+                align_corners=False,
+            )
+
+        if not retain_cls_token:
+            return new_abs_pos.permute(0, 2, 3, 1)
+        else:
+            # add cls_token back, flatten spatial dims
+            assert has_cls_token
+            return torch.cat(
+                [cls_pos, new_abs_pos.permute(0, 2, 3, 1).reshape(1, h * w, -1)],
+                dim=1,
+            )
+
+    else:
+        if not retain_cls_token:
+            return abs_pos.reshape(1, h, w, -1)
+        else:
+            assert has_cls_token
+            return torch.cat([cls_pos, abs_pos], dim=1)
+
+
+def concat_rel_pos(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    q_hw: tuple[int, int],
+    k_hw: tuple[int, int],
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    rescale: bool = False,
+    relative_coords: torch.Tensor = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Concatenate rel pos coeffs to the q & k tensors, so that qk^T is now effectively including rel pos biases.
+
+    Args:
+        q (torch.Tensor): q tensor with shape (B, L_q, C).
+        k (torch.Tensor): k tensor with shape (B, L_k, C).
+        q_hw: These are spatial size of q tensors.
+        k_hw: These are spatial size of k tensors.
+        rel_pos_h: These are relative pos embeddings/params of height.
+        rel_pos_w: These are relative pos embeddings/params of width.
+        rescale (bool): whether to rescale. e.g. for use when using sdpa, pytorch will scale by the wrong factor due to
+            the concat.
+        relative_coords (torch.Tensor, optional): Precomputed relative coords index tensor.
+
+    Returns:
+        q, k: But, padded so that qk^T accounts for rel pos biases.
+    """
+    q_h, q_w = q_hw
+    k_h, k_w = k_hw
+
+    assert (q_h == q_w) and (k_h == k_w), "only square inputs supported"
+
+    if relative_coords is not None:
+        Rh = rel_pos_h[relative_coords]
+        Rw = rel_pos_w[relative_coords]
+    else:
+        Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+        Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    B, _, dim = q.shape
+    r_q = q.reshape(B, q_h, q_w, dim)
+
+    old_scale = dim**0.5
+    new_scale = (dim + k_h + k_w) ** 0.5 if rescale else old_scale  # for sdpa
+    # attn will be divided by new_scale, but we want to divide q by old_scale
+    scale_ratio = new_scale / old_scale
+
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) * new_scale  # (B, q_h, q_w, k_h)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) * new_scale  # (B, q_h, q_w, k_w)
+
+    eye_h = torch.eye(k_h, dtype=q.dtype, device=q.device)
+    eye_w = torch.eye(k_w, dtype=q.dtype, device=q.device)
+
+    eye_h = eye_h.view(1, k_h, 1, k_h).expand([B, k_h, k_w, k_h])
+    eye_w = eye_w.view(1, 1, k_w, k_w).expand([B, k_h, k_w, k_w])
+
+    q = torch.cat([r_q * scale_ratio, rel_h, rel_w], dim=-1).view(B, q_h * q_w, -1)
+    k = torch.cat([k.view(B, k_h, k_w, -1), eye_h, eye_w], dim=-1).view(B, k_h * k_w, -1)
+
+    return q, k
```