dgenerate-ultralytics-headless 8.3.137__py3-none-any.whl → 8.3.224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/METADATA +41 -34
- dgenerate_ultralytics_headless-8.3.224.dist-info/RECORD +285 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/WHEEL +1 -1
- tests/__init__.py +7 -6
- tests/conftest.py +15 -39
- tests/test_cli.py +17 -17
- tests/test_cuda.py +17 -8
- tests/test_engine.py +36 -10
- tests/test_exports.py +98 -37
- tests/test_integrations.py +12 -15
- tests/test_python.py +126 -82
- tests/test_solutions.py +319 -135
- ultralytics/__init__.py +27 -9
- ultralytics/cfg/__init__.py +83 -87
- ultralytics/cfg/datasets/Argoverse.yaml +4 -4
- ultralytics/cfg/datasets/DOTAv1.5.yaml +2 -2
- ultralytics/cfg/datasets/DOTAv1.yaml +2 -2
- ultralytics/cfg/datasets/GlobalWheat2020.yaml +2 -2
- ultralytics/cfg/datasets/HomeObjects-3K.yaml +4 -5
- ultralytics/cfg/datasets/ImageNet.yaml +3 -3
- ultralytics/cfg/datasets/Objects365.yaml +24 -20
- ultralytics/cfg/datasets/SKU-110K.yaml +9 -9
- ultralytics/cfg/datasets/VOC.yaml +10 -13
- ultralytics/cfg/datasets/VisDrone.yaml +43 -33
- ultralytics/cfg/datasets/african-wildlife.yaml +5 -5
- ultralytics/cfg/datasets/brain-tumor.yaml +4 -5
- ultralytics/cfg/datasets/carparts-seg.yaml +5 -5
- ultralytics/cfg/datasets/coco-pose.yaml +26 -4
- ultralytics/cfg/datasets/coco.yaml +4 -4
- ultralytics/cfg/datasets/coco128-seg.yaml +2 -2
- ultralytics/cfg/datasets/coco128.yaml +2 -2
- ultralytics/cfg/datasets/coco8-grayscale.yaml +103 -0
- ultralytics/cfg/datasets/coco8-multispectral.yaml +2 -2
- ultralytics/cfg/datasets/coco8-pose.yaml +23 -2
- ultralytics/cfg/datasets/coco8-seg.yaml +2 -2
- ultralytics/cfg/datasets/coco8.yaml +2 -2
- ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
- ultralytics/cfg/datasets/crack-seg.yaml +5 -5
- ultralytics/cfg/datasets/dog-pose.yaml +32 -4
- ultralytics/cfg/datasets/dota8-multispectral.yaml +2 -2
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +29 -4
- ultralytics/cfg/datasets/lvis.yaml +9 -9
- ultralytics/cfg/datasets/medical-pills.yaml +4 -5
- ultralytics/cfg/datasets/open-images-v7.yaml +7 -10
- ultralytics/cfg/datasets/package-seg.yaml +5 -5
- ultralytics/cfg/datasets/signature.yaml +4 -4
- ultralytics/cfg/datasets/tiger-pose.yaml +20 -4
- ultralytics/cfg/datasets/xView.yaml +5 -5
- ultralytics/cfg/default.yaml +96 -93
- ultralytics/cfg/trackers/botsort.yaml +16 -17
- ultralytics/cfg/trackers/bytetrack.yaml +9 -11
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +12 -12
- ultralytics/data/augment.py +531 -564
- ultralytics/data/base.py +76 -81
- ultralytics/data/build.py +206 -42
- ultralytics/data/converter.py +179 -78
- ultralytics/data/dataset.py +121 -121
- ultralytics/data/loaders.py +114 -91
- ultralytics/data/split.py +28 -15
- ultralytics/data/split_dota.py +67 -48
- ultralytics/data/utils.py +110 -89
- ultralytics/engine/exporter.py +422 -460
- ultralytics/engine/model.py +224 -252
- ultralytics/engine/predictor.py +94 -89
- ultralytics/engine/results.py +345 -595
- ultralytics/engine/trainer.py +231 -134
- ultralytics/engine/tuner.py +279 -73
- ultralytics/engine/validator.py +53 -46
- ultralytics/hub/__init__.py +26 -28
- ultralytics/hub/auth.py +30 -16
- ultralytics/hub/google/__init__.py +34 -36
- ultralytics/hub/session.py +53 -77
- ultralytics/hub/utils.py +23 -109
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +36 -18
- ultralytics/models/fastsam/predict.py +33 -44
- ultralytics/models/fastsam/utils.py +4 -5
- ultralytics/models/fastsam/val.py +12 -14
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +16 -20
- ultralytics/models/nas/predict.py +12 -14
- ultralytics/models/nas/val.py +4 -5
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +9 -9
- ultralytics/models/rtdetr/predict.py +22 -17
- ultralytics/models/rtdetr/train.py +20 -16
- ultralytics/models/rtdetr/val.py +79 -59
- ultralytics/models/sam/__init__.py +8 -2
- ultralytics/models/sam/amg.py +53 -38
- ultralytics/models/sam/build.py +29 -31
- ultralytics/models/sam/model.py +33 -38
- ultralytics/models/sam/modules/blocks.py +159 -182
- ultralytics/models/sam/modules/decoders.py +38 -47
- ultralytics/models/sam/modules/encoders.py +114 -133
- ultralytics/models/sam/modules/memory_attention.py +38 -31
- ultralytics/models/sam/modules/sam.py +114 -93
- ultralytics/models/sam/modules/tiny_encoder.py +268 -291
- ultralytics/models/sam/modules/transformer.py +59 -66
- ultralytics/models/sam/modules/utils.py +55 -72
- ultralytics/models/sam/predict.py +745 -341
- ultralytics/models/utils/loss.py +118 -107
- ultralytics/models/utils/ops.py +118 -71
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +28 -26
- ultralytics/models/yolo/classify/train.py +50 -81
- ultralytics/models/yolo/classify/val.py +68 -61
- ultralytics/models/yolo/detect/predict.py +12 -15
- ultralytics/models/yolo/detect/train.py +56 -46
- ultralytics/models/yolo/detect/val.py +279 -223
- ultralytics/models/yolo/model.py +167 -86
- ultralytics/models/yolo/obb/predict.py +7 -11
- ultralytics/models/yolo/obb/train.py +23 -25
- ultralytics/models/yolo/obb/val.py +107 -99
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +12 -14
- ultralytics/models/yolo/pose/train.py +31 -69
- ultralytics/models/yolo/pose/val.py +119 -254
- ultralytics/models/yolo/segment/predict.py +21 -25
- ultralytics/models/yolo/segment/train.py +12 -66
- ultralytics/models/yolo/segment/val.py +126 -305
- ultralytics/models/yolo/world/train.py +53 -45
- ultralytics/models/yolo/world/train_world.py +51 -32
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +30 -37
- ultralytics/models/yolo/yoloe/train.py +89 -71
- ultralytics/models/yolo/yoloe/train_seg.py +15 -17
- ultralytics/models/yolo/yoloe/val.py +56 -41
- ultralytics/nn/__init__.py +9 -11
- ultralytics/nn/autobackend.py +179 -107
- ultralytics/nn/modules/__init__.py +67 -67
- ultralytics/nn/modules/activation.py +8 -7
- ultralytics/nn/modules/block.py +302 -323
- ultralytics/nn/modules/conv.py +61 -104
- ultralytics/nn/modules/head.py +488 -186
- ultralytics/nn/modules/transformer.py +183 -123
- ultralytics/nn/modules/utils.py +15 -20
- ultralytics/nn/tasks.py +327 -203
- ultralytics/nn/text_model.py +81 -65
- ultralytics/py.typed +1 -0
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +19 -27
- ultralytics/solutions/analytics.py +36 -26
- ultralytics/solutions/config.py +29 -28
- ultralytics/solutions/distance_calculation.py +23 -24
- ultralytics/solutions/heatmap.py +17 -19
- ultralytics/solutions/instance_segmentation.py +21 -19
- ultralytics/solutions/object_blurrer.py +16 -17
- ultralytics/solutions/object_counter.py +48 -53
- ultralytics/solutions/object_cropper.py +22 -16
- ultralytics/solutions/parking_management.py +61 -58
- ultralytics/solutions/queue_management.py +19 -19
- ultralytics/solutions/region_counter.py +63 -50
- ultralytics/solutions/security_alarm.py +22 -25
- ultralytics/solutions/similarity_search.py +107 -60
- ultralytics/solutions/solutions.py +343 -262
- ultralytics/solutions/speed_estimation.py +35 -31
- ultralytics/solutions/streamlit_inference.py +104 -40
- ultralytics/solutions/templates/similarity-search.html +31 -24
- ultralytics/solutions/trackzone.py +24 -24
- ultralytics/solutions/vision_eye.py +11 -12
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +18 -27
- ultralytics/trackers/bot_sort.py +48 -39
- ultralytics/trackers/byte_tracker.py +94 -94
- ultralytics/trackers/track.py +7 -16
- ultralytics/trackers/utils/gmc.py +37 -69
- ultralytics/trackers/utils/kalman_filter.py +68 -76
- ultralytics/trackers/utils/matching.py +13 -17
- ultralytics/utils/__init__.py +251 -275
- ultralytics/utils/autobatch.py +19 -7
- ultralytics/utils/autodevice.py +68 -38
- ultralytics/utils/benchmarks.py +169 -130
- ultralytics/utils/callbacks/base.py +12 -13
- ultralytics/utils/callbacks/clearml.py +14 -15
- ultralytics/utils/callbacks/comet.py +139 -66
- ultralytics/utils/callbacks/dvc.py +19 -27
- ultralytics/utils/callbacks/hub.py +8 -6
- ultralytics/utils/callbacks/mlflow.py +6 -10
- ultralytics/utils/callbacks/neptune.py +11 -19
- ultralytics/utils/callbacks/platform.py +73 -0
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +9 -12
- ultralytics/utils/callbacks/wb.py +33 -30
- ultralytics/utils/checks.py +163 -114
- ultralytics/utils/cpu.py +89 -0
- ultralytics/utils/dist.py +24 -20
- ultralytics/utils/downloads.py +176 -146
- ultralytics/utils/errors.py +11 -13
- ultralytics/utils/events.py +113 -0
- ultralytics/utils/export/__init__.py +7 -0
- ultralytics/utils/{export.py → export/engine.py} +81 -63
- ultralytics/utils/export/imx.py +294 -0
- ultralytics/utils/export/tensorflow.py +217 -0
- ultralytics/utils/files.py +33 -36
- ultralytics/utils/git.py +137 -0
- ultralytics/utils/instance.py +105 -120
- ultralytics/utils/logger.py +404 -0
- ultralytics/utils/loss.py +99 -61
- ultralytics/utils/metrics.py +649 -478
- ultralytics/utils/nms.py +337 -0
- ultralytics/utils/ops.py +263 -451
- ultralytics/utils/patches.py +70 -31
- ultralytics/utils/plotting.py +253 -223
- ultralytics/utils/tal.py +48 -61
- ultralytics/utils/torch_utils.py +244 -251
- ultralytics/utils/tqdm.py +438 -0
- ultralytics/utils/triton.py +22 -23
- ultralytics/utils/tuner.py +11 -10
- dgenerate_ultralytics_headless-8.3.137.dist-info/RECORD +0 -272
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/memory_attention.py

@@ -1,17 +1,17 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+from __future__ import annotations
+
 import copy
-from typing import Optional

 import torch
-from torch import Tensor, nn
+from torch import nn

 from .blocks import RoPEAttention


 class MemoryAttentionLayer(nn.Module):
-    """
-    Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.
+    """Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.

     This class combines self-attention, cross-attention, and feedforward components to process input tensors and
     generate memory-based attention outputs.
@@ -60,8 +60,7 @@ class MemoryAttentionLayer(nn.Module):
         pos_enc_at_cross_attn_keys: bool = True,
         pos_enc_at_cross_attn_queries: bool = False,
     ):
-        """
-        Initialize a memory attention layer with self-attention, cross-attention, and feedforward components.
+        """Initialize a memory attention layer with self-attention, cross-attention, and feedforward components.

         Args:
             d_model (int): Dimensionality of the model.
@@ -103,7 +102,7 @@ class MemoryAttentionLayer(nn.Module):
         self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
         self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

-    def _forward_sa(self, tgt: Tensor, query_pos: Optional[Tensor]) -> Tensor:
+    def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
         """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
         tgt2 = self.norm1(tgt)
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -113,12 +112,12 @@ class MemoryAttentionLayer(nn.Module):

     def _forward_ca(
         self,
-        tgt: Tensor,
-        memory: Tensor,
-        query_pos: Optional[Tensor],
-        pos: Optional[Tensor],
+        tgt: torch.Tensor,
+        memory: torch.Tensor,
+        query_pos: torch.Tensor | None,
+        pos: torch.Tensor | None,
         num_k_exclude_rope: int = 0,
-    ) -> Tensor:
+    ) -> torch.Tensor:
         """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
         kwds = {}
         if num_k_exclude_rope > 0:
@@ -138,13 +137,24 @@ class MemoryAttentionLayer(nn.Module):

     def forward(
         self,
-        tgt: Tensor,
-        memory: Tensor,
-        pos: Optional[Tensor] = None,
-        query_pos: Optional[Tensor] = None,
+        tgt: torch.Tensor,
+        memory: torch.Tensor,
+        pos: torch.Tensor | None = None,
+        query_pos: torch.Tensor | None = None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
-        """Process input tensors through self-attention, cross-attention, and feedforward network layers."""
+        """Process input tensors through self-attention, cross-attention, and feedforward network layers.
+
+        Args:
+            tgt (torch.Tensor): Target tensor for self-attention with shape (N, L, D).
+            memory (torch.Tensor): Memory tensor for cross-attention with shape (N, S, D).
+            pos (Optional[torch.Tensor]): Positional encoding for memory tensor.
+            query_pos (Optional[torch.Tensor]): Positional encoding for target tensor.
+            num_k_exclude_rope (int): Number of keys to exclude from rotary position embedding.
+
+        Returns:
+            (torch.Tensor): Processed tensor after attention and feedforward layers with shape (N, L, D).
+        """
         tgt = self._forward_sa(tgt, query_pos)
         tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
         # MLP
@@ -155,11 +165,10 @@ class MemoryAttentionLayer(nn.Module):


 class MemoryAttention(nn.Module):
-    """
-    Memory attention module for processing sequential data with self and cross-attention mechanisms.
+    """Memory attention module for processing sequential data with self and cross-attention mechanisms.

-    This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
-    for processing sequential data, particularly useful in transformer-like architectures.
+    This class implements a multi-layer attention mechanism that combines self-attention and cross-attention for
+    processing sequential data, particularly useful in transformer-like architectures.

     Attributes:
         d_model (int): The dimension of the model's hidden state.
@@ -193,11 +202,10 @@ class MemoryAttention(nn.Module):
         num_layers: int,
         batch_first: bool = True,  # Do layers expect batch first input?
     ):
-        """
-        Initialize MemoryAttention with specified layers and normalization for sequential data processing.
+        """Initialize MemoryAttention with specified layers and normalization for sequential data processing.

-        This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
-        for processing sequential data, particularly useful in transformer-like architectures.
+        This class implements a multi-layer attention mechanism that combines self-attention and cross-attention for
+        processing sequential data, particularly useful in transformer-like architectures.

         Args:
             d_model (int): The dimension of the model's hidden state.
@@ -230,18 +238,17 @@ class MemoryAttention(nn.Module):
         self,
         curr: torch.Tensor,  # self-attention inputs
         memory: torch.Tensor,  # cross-attention inputs
-        curr_pos: Optional[Tensor] = None,  # pos_enc for self-attention inputs
-        memory_pos: Optional[Tensor] = None,  # pos_enc for cross-attention inputs
+        curr_pos: torch.Tensor | None = None,  # pos_enc for self-attention inputs
+        memory_pos: torch.Tensor | None = None,  # pos_enc for cross-attention inputs
         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
     ) -> torch.Tensor:
-        """
-        Process inputs through attention layers, applying self and cross-attention with positional encoding.
+        """Process inputs through attention layers, applying self and cross-attention with positional encoding.

         Args:
             curr (torch.Tensor): Self-attention input tensor, representing the current state.
             memory (torch.Tensor): Cross-attention input tensor, representing memory information.
-            curr_pos (Optional[Tensor]): Positional encoding for self-attention inputs.
-            memory_pos (Optional[Tensor]): Positional encoding for cross-attention inputs.
+            curr_pos (Optional[torch.Tensor]): Positional encoding for self-attention inputs.
+            memory_pos (Optional[torch.Tensor]): Positional encoding for cross-attention inputs.
             num_obj_ptr_tokens (int): Number of object pointer tokens to exclude from rotary position embedding.

         Returns:
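The hunks above consistently add `from __future__ import annotations` and replace `Optional[Tensor]` hints with `torch.Tensor | None`. A minimal standalone sketch of that annotation style follows; it is illustrative only and not code taken from the package.

```python
# Sketch of the PEP 604 style the diff adopts: with postponed evaluation of
# annotations, `torch.Tensor | None` works even on Python versions where the
# union syntax is not yet a runtime type expression.
from __future__ import annotations

import torch
from torch import nn


class TinyLayer(nn.Module):
    """Toy layer using the new-style hints; the names here are illustrative."""

    def forward(self, x: torch.Tensor, pos: torch.Tensor | None = None) -> torch.Tensor:
        # Add the positional encoding only when one is provided.
        return x if pos is None else x + pos


layer = TinyLayer()
out_with_pos = layer(torch.zeros(2, 4), torch.ones(2, 4))  # pos supplied
out_without_pos = layer(torch.zeros(2, 4))                 # pos omitted
```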
ultralytics/models/sam/modules/sam.py

@@ -3,10 +3,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.

-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-from typing import List
+from __future__ import annotations

 import torch
 import torch.nn.functional as F
@@ -26,20 +23,21 @@ NO_OBJ_SCORE = -1024.0


 class SAMModel(nn.Module):
-    """
-    Segment Anything Model (SAM) for object segmentation tasks.
+    """Segment Anything Model (SAM) for object segmentation tasks.

-    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
-    and input prompts.
+    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images and input
+    prompts.

     Attributes:
         mask_threshold (float): Threshold value for mask prediction.
         image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
         prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
         mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
+        pixel_mean (torch.Tensor): Mean values for normalizing pixels in the input image.
+        pixel_std (torch.Tensor): Standard deviation values for normalizing pixels in the input image.

     Methods:
-
+        set_imgsz: Set image size to make model compatible with different image sizes.

     Examples:
         >>> image_encoder = ImageEncoderViT(...)
@@ -59,18 +57,17 @@ class SAMModel(nn.Module):
         image_encoder: ImageEncoderViT,
         prompt_encoder: PromptEncoder,
         mask_decoder: MaskDecoder,
-        pixel_mean: List[float] = (123.675, 116.28, 103.53),
-        pixel_std: List[float] = (58.395, 57.12, 57.375),
+        pixel_mean: list[float] = (123.675, 116.28, 103.53),
+        pixel_std: list[float] = (58.395, 57.12, 57.375),
     ) -> None:
-        """
-        Initialize the SAMModel class to predict object masks from an image and input prompts.
+        """Initialize the SAMModel class to predict object masks from an image and input prompts.

         Args:
             image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
             prompt_encoder (PromptEncoder): Encodes various types of input prompts.
             mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
-            pixel_mean (List[float]): Mean values for normalizing pixels in the input image.
-            pixel_std (List[float]): Standard deviation values for normalizing pixels in the input image.
+            pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
+            pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.

         Examples:
             >>> image_encoder = ImageEncoderViT(...)
@@ -90,12 +87,7 @@ class SAMModel(nn.Module):
         self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

     def set_imgsz(self, imgsz):
-        """
-        Set image size to make model compatible with different image sizes.
-
-        Args:
-            imgsz (Tuple[int, int]): The size of the input image.
-        """
+        """Set image size to make model compatible with different image sizes."""
         if hasattr(self.image_encoder, "set_imgsz"):
             self.image_encoder.set_imgsz(imgsz)
         self.prompt_encoder.input_image_size = imgsz
@@ -104,11 +96,10 @@ class SAMModel(nn.Module):


 class SAM2Model(torch.nn.Module):
-    """
-    SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.
+    """SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

-    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
-    for temporal consistency and efficient tracking of objects across frames.
+    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms for temporal
+    consistency and efficient tracking of objects across frames.

     Attributes:
         mask_threshold (float): Threshold value for mask prediction.
@@ -124,10 +115,48 @@ class SAM2Model(torch.nn.Module):
         sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
         obj_ptr_proj (nn.Module): Projection layer for object pointers.
         obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.
+        hidden_dim (int): Hidden dimension of the model.
+        mem_dim (int): Memory dimension for encoding features.
+        use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
+        use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
+        max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder cross-attention.
+        add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers.
+        proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
+            encoding in object pointers.
+        use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in temporal positional encoding.
+        only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
+            evaluation.
+        pred_obj_scores (bool): Whether to predict if there is an object in the frame.
+        pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
+        fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
+        soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
+        use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
+        no_obj_embed_spatial (torch.Tensor | None): No-object embedding for spatial frames.
+        max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
+        directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the first
+            frame.
+        multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial conditioning
+            frames.
+        multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
+        multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
+        multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
+        use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
+        iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
+        memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
+        non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in memory
+            encoder during evaluation.
+        sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
+        sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
+        binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames with
+            clicks during evaluation.
+        use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM prompt
+            encoder and mask decoder on frames with mask input.

     Methods:
-        forward_image:
-        track_step:
+        forward_image: Process image batch through encoder to extract multi-level features.
+        track_step: Perform a single tracking step, updating object masks and memory features.
+        set_binarize: Set binarize for VideoPredictor.
+        set_imgsz: Set image size to make model compatible with different image sizes.

     Examples:
         >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
@@ -176,56 +205,53 @@ class SAM2Model(torch.nn.Module):
         sam_mask_decoder_extra_args=None,
         compile_image_encoder: bool = False,
     ):
-        """
-        Initialize the SAM2Model for video object segmentation with memory-based tracking.
+        """Initialize the SAM2Model for video object segmentation with memory-based tracking.

         Args:
             image_encoder (nn.Module): Visual encoder for extracting image features.
             memory_attention (nn.Module): Module for attending to memory features.
             memory_encoder (nn.Module): Encoder for generating memory representations.
-            num_maskmem (int): Number of accessible memory frames.
+            num_maskmem (int): Number of accessible memory frames.
             image_size (int): Size of input images.
             backbone_stride (int): Stride of the image backbone output.
             sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
             sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
-            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
-                with clicks during evaluation.
+            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames with
+                clicks during evaluation.
             use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                 prompt encoder and mask decoder on frames with mask input.
             max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
-
-
-                first frame.
+            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the first
+                frame.
             use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
-            multimask_output_in_sam (bool): Whether to output multiple
-
+            multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial conditioning
+                frames.
             multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
             multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
             multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
             use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
             iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
            memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
-            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
-                memory encoder during evaluation.
+            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in memory
+                encoder during evaluation.
             use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
             max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                 cross-attention.
-            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
-                the encoder.
+            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in the
+                encoder.
             proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                 encoding in object pointers.
-            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance
-                in the
-
-
-                during evaluation.
+            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in the temporal positional encoding
+                in the object pointers.
+            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
+                evaluation.
             pred_obj_scores (bool): Whether to predict if there is an object in the frame.
             pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
             fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
             soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
             use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
             no_obj_embed_spatial (bool): Whether add no obj embedding to spatial frames.
-            sam_mask_decoder_extra_args (
+            sam_mask_decoder_extra_args (dict | None): Extra arguments for constructing the SAM mask decoder.
             compile_image_encoder (bool): Whether to compile the image encoder for faster inference.

         Examples:
@@ -398,36 +424,32 @@ class SAM2Model(torch.nn.Module):
         high_res_features=None,
         multimask_output=False,
     ):
-        """
-        Forward pass through SAM prompt encoders and mask heads.
+        """Forward pass through SAM prompt encoders and mask heads.

         This method processes image features and optional point/mask inputs to generate object masks and scores.

         Args:
             backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
-            point_inputs (
-
-
-
-
-            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
-
-            high_res_features (
-
-
-
-                output only 1 mask and its IoU estimate.
+            point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts.
+                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute pixel-unit coordinates in
+                (x, y) format for P input points.
+                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks, 0 means negative
+                clicks, and -1 means padding.
+            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the same spatial
+                size as the image.
+            high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes (B, C, 4*H, 4*W) and (B,
+                C, 2*H, 2*W) respectively, used as high-resolution feature maps for SAM decoder.
+            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False, output only 1
+                mask and its IoU estimate.

         Returns:
-            (
-
-
-
-
-
-            object_score_logits: Tensor of shape (B) with object score logits.
-            Where M is 3 if multimask_output=True, and 1 if multimask_output=False.
+            low_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
+            high_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
+            ious (torch.Tensor): Tensor of shape (B, M) with estimated IoU for each output mask.
+            low_res_masks (torch.Tensor): Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
+            high_res_masks (torch.Tensor): Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
+            obj_ptr (torch.Tensor): Tensor of shape (B, C) with object pointer vector for the output mask.
+            object_score_logits (torch.Tensor): Tensor of shape (B) with object score logits.

         Examples:
             >>> backbone_features = torch.rand(1, 256, 32, 32)
@@ -444,7 +466,7 @@ class SAM2Model(torch.nn.Module):
             ...     object_score_logits,
             ... ) = results
         """
-        B = backbone_features.size(0)
+        B = backbone_features.shape[0]
         device = backbone_features.device
         assert backbone_features.size(1) == self.sam_prompt_embed_dim
         assert backbone_features.size(2) == self.sam_image_embedding_size
@@ -454,10 +476,10 @@ class SAM2Model(torch.nn.Module):
         if point_inputs is not None:
             sam_point_coords = point_inputs["point_coords"]
             sam_point_labels = point_inputs["point_labels"]
-            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+            assert sam_point_coords.shape[0] == B and sam_point_labels.shape[0] == B
         else:
             # If no points are provide, pad with an empty point (with label -1)
-            sam_point_coords = torch.zeros(B, 1, 2, device=device)
+            sam_point_coords = torch.zeros(B, 1, 2, device=device, dtype=backbone_features.dtype)
             sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

         # b) Handle mask prompts
@@ -502,7 +524,6 @@ class SAM2Model(torch.nn.Module):

         # convert masks from possibly bfloat16 (or float16) to float32
         # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
-        low_res_multimasks = low_res_multimasks.float()
         high_res_multimasks = F.interpolate(
             low_res_multimasks,
             size=(self.image_size, self.image_size),
@@ -529,12 +550,11 @@ class SAM2Model(torch.nn.Module):
             if self.soft_no_obj_ptr:
                 lambda_is_obj_appearing = object_score_logits.sigmoid()
             else:
-                lambda_is_obj_appearing = is_obj_appearing.float()
+                lambda_is_obj_appearing = is_obj_appearing.to(obj_ptr.dtype)

             if self.fixed_no_obj_ptr:
                 obj_ptr = lambda_is_obj_appearing * obj_ptr
             obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
-
         return (
             low_res_multimasks,
             high_res_multimasks,
@@ -545,7 +565,7 @@ class SAM2Model(torch.nn.Module):
             object_score_logits,
         )

-    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
+    def _use_mask_as_output(self, mask_inputs, backbone_features=None, high_res_features=None):
         """Process mask inputs directly as output, bypassing SAM encoder/decoder."""
         # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
         out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
@@ -559,10 +579,10 @@ class SAM2Model(torch.nn.Module):
             antialias=True,  # use antialias for downsampling
         )
         # a dummy IoU prediction of all 1's under mask input
-        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
-        if not self.use_obj_ptrs_in_encoder:
+        ious = mask_inputs.new_ones(mask_inputs.shape[0], 1).float()
+        if not self.use_obj_ptrs_in_encoder or backbone_features is None or high_res_features is None:
             # all zeros as a dummy object pointer (of shape [B, C])
-            obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
+            obj_ptr = torch.zeros(mask_inputs.shape[0], self.hidden_dim, device=mask_inputs.device)
         else:
             # produce an object pointer using the SAM decoder from the mask input
             _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
@@ -686,7 +706,7 @@ class SAM2Model(torch.nn.Module):
                     continue  # skip padding frames
                 # "maskmem_features" might have been offloaded to CPU in demo use cases,
                 # so we load it back to inference device (it's a no-op if it's already on device).
-                feats = prev["maskmem_features"].to(device=device, non_blocking=True)
+                feats = prev["maskmem_features"].to(device=device, non_blocking=device.type == "cuda")
                 to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
                 # Spatial positional encoding (it might have been offloaded to CPU in eval)
                 maskmem_enc = prev["maskmem_pos_enc"][-1].to(device=device)
@@ -738,7 +758,7 @@ class SAM2Model(torch.nn.Module):
                 if self.add_tpos_enc_to_obj_ptrs:
                     t_diff_max = max_obj_ptrs_in_encoder - 1
                     tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
-                    obj_pos = torch.tensor(pos_list, device=device)
+                    obj_pos = torch.tensor(pos_list, device=device, dtype=current_vision_feats[-1].dtype)
                     obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
                     obj_pos = self.obj_ptr_tpos_proj(obj_pos)
                     obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
@@ -803,7 +823,7 @@ class SAM2Model(torch.nn.Module):
         # scale the raw mask logits with a temperature before applying sigmoid
         binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
         if binarize and not self.training:
-            mask_for_mem = (pred_masks_high_res > 0).float()
+            mask_for_mem = (pred_masks_high_res > 0).to(pix_feat.dtype)
         else:
             # apply sigmoid on the raw mask logits to turn them into range (0, 1)
             mask_for_mem = torch.sigmoid(pred_masks_high_res)
@@ -840,7 +860,6 @@ class SAM2Model(torch.nn.Module):
         prev_sam_mask_logits,
     ):
         """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
-        current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
         # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
         if len(current_vision_feats) > 1:
             high_res_features = [
@@ -854,7 +873,7 @@ class SAM2Model(torch.nn.Module):
             # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
             pix_feat = current_vision_feats[-1].permute(1, 2, 0)
             pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
-            sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs)
+            sam_outputs = self._use_mask_as_output(mask_inputs, pix_feat, high_res_features)
         else:
             # fused the visual feature with previous memory features in the memory bank
             pix_feat = self._prepare_memory_conditioned_features(
@@ -882,7 +901,7 @@ class SAM2Model(torch.nn.Module):
             high_res_features=high_res_features,
             multimask_output=multimask_output,
         )
-        return current_out, sam_outputs, high_res_features, pix_feat
+        return sam_outputs, high_res_features, pix_feat

     def _encode_memory_in_output(
         self,
@@ -896,11 +915,10 @@ class SAM2Model(torch.nn.Module):
     ):
         """Run memory encoder on predicted mask to encode it into a new memory feature for future frames."""
         if run_mem_encoder and self.num_maskmem > 0:
-            high_res_masks_for_mem_enc = high_res_masks
             maskmem_features, maskmem_pos_enc = self._encode_new_memory(
                 current_vision_feats=current_vision_feats,
                 feat_sizes=feat_sizes,
-                pred_masks_high_res=high_res_masks_for_mem_enc,
+                pred_masks_high_res=high_res_masks,
                 object_score_logits=object_score_logits,
                 is_mask_from_pts=(point_inputs is not None),
             )
@@ -932,7 +950,7 @@ class SAM2Model(torch.nn.Module):
         prev_sam_mask_logits=None,
     ):
         """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
-        current_out, sam_outputs, _, _ = self._track_step(
+        sam_outputs, _, _ = self._track_step(
             frame_idx,
             is_init_cond_frame,
             current_vision_feats,
@@ -947,9 +965,11 @@ class SAM2Model(torch.nn.Module):
         )
         _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

-        current_out["pred_masks"] = low_res_masks
-        current_out["pred_masks_high_res"] = high_res_masks
-        current_out["obj_ptr"] = obj_ptr
+        current_out = {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+        }
         if not self.training:
             # Only add this in inference (to avoid unused param in activation checkpointing;
             # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
@@ -980,7 +1000,7 @@ class SAM2Model(torch.nn.Module):
     @staticmethod
     def _apply_non_overlapping_constraints(pred_masks):
         """Apply non-overlapping constraints to masks, keeping the highest scoring object per location."""
-        batch_size = pred_masks.size(0)
+        batch_size = pred_masks.shape[0]
         if batch_size == 1:
             return pred_masks

@@ -1004,3 +1024,4 @@ class SAM2Model(torch.nn.Module):
         self.image_size = imgsz[0]
         self.sam_prompt_encoder.input_image_size = imgsz
         self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
+        self.sam_image_embedding_size = self.image_size // self.backbone_stride  # update image embedding size