dgenerate-ultralytics-headless 8.3.196-py3-none-any.whl → 8.3.248-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to the supported public registries, and is provided for informational purposes only.
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/METADATA +33 -34
- dgenerate_ultralytics_headless-8.3.248.dist-info/RECORD +298 -0
- tests/__init__.py +5 -7
- tests/conftest.py +8 -15
- tests/test_cli.py +8 -10
- tests/test_cuda.py +9 -10
- tests/test_engine.py +29 -2
- tests/test_exports.py +69 -21
- tests/test_integrations.py +8 -11
- tests/test_python.py +109 -71
- tests/test_solutions.py +170 -159
- ultralytics/__init__.py +27 -9
- ultralytics/cfg/__init__.py +57 -64
- ultralytics/cfg/datasets/Argoverse.yaml +7 -6
- ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
- ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
- ultralytics/cfg/datasets/ImageNet.yaml +1 -1
- ultralytics/cfg/datasets/Objects365.yaml +19 -15
- ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
- ultralytics/cfg/datasets/VOC.yaml +19 -21
- ultralytics/cfg/datasets/VisDrone.yaml +5 -5
- ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
- ultralytics/cfg/datasets/coco-pose.yaml +24 -2
- ultralytics/cfg/datasets/coco.yaml +2 -2
- ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
- ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
- ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
- ultralytics/cfg/datasets/dog-pose.yaml +28 -0
- ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
- ultralytics/cfg/datasets/kitti.yaml +27 -0
- ultralytics/cfg/datasets/lvis.yaml +7 -7
- ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
- ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
- ultralytics/cfg/datasets/xView.yaml +16 -16
- ultralytics/cfg/default.yaml +96 -94
- ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
- ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
- ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
- ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
- ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
- ultralytics/cfg/models/v6/yolov6.yaml +1 -1
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
- ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
- ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/cfg/trackers/botsort.yaml +16 -17
- ultralytics/cfg/trackers/bytetrack.yaml +9 -11
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +3 -4
- ultralytics/data/augment.py +286 -476
- ultralytics/data/base.py +18 -26
- ultralytics/data/build.py +151 -26
- ultralytics/data/converter.py +38 -50
- ultralytics/data/dataset.py +47 -75
- ultralytics/data/loaders.py +42 -49
- ultralytics/data/split.py +5 -6
- ultralytics/data/split_dota.py +8 -15
- ultralytics/data/utils.py +41 -45
- ultralytics/engine/exporter.py +462 -462
- ultralytics/engine/model.py +150 -191
- ultralytics/engine/predictor.py +30 -40
- ultralytics/engine/results.py +177 -311
- ultralytics/engine/trainer.py +193 -120
- ultralytics/engine/tuner.py +77 -63
- ultralytics/engine/validator.py +39 -22
- ultralytics/hub/__init__.py +16 -19
- ultralytics/hub/auth.py +6 -12
- ultralytics/hub/google/__init__.py +7 -10
- ultralytics/hub/session.py +15 -25
- ultralytics/hub/utils.py +5 -8
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +8 -10
- ultralytics/models/fastsam/predict.py +19 -30
- ultralytics/models/fastsam/utils.py +1 -2
- ultralytics/models/fastsam/val.py +5 -7
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +5 -8
- ultralytics/models/nas/predict.py +7 -9
- ultralytics/models/nas/val.py +1 -2
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +7 -8
- ultralytics/models/rtdetr/predict.py +15 -19
- ultralytics/models/rtdetr/train.py +10 -13
- ultralytics/models/rtdetr/val.py +21 -23
- ultralytics/models/sam/__init__.py +15 -2
- ultralytics/models/sam/amg.py +14 -20
- ultralytics/models/sam/build.py +26 -19
- ultralytics/models/sam/build_sam3.py +377 -0
- ultralytics/models/sam/model.py +29 -32
- ultralytics/models/sam/modules/blocks.py +83 -144
- ultralytics/models/sam/modules/decoders.py +22 -40
- ultralytics/models/sam/modules/encoders.py +44 -101
- ultralytics/models/sam/modules/memory_attention.py +16 -30
- ultralytics/models/sam/modules/sam.py +206 -79
- ultralytics/models/sam/modules/tiny_encoder.py +64 -83
- ultralytics/models/sam/modules/transformer.py +18 -28
- ultralytics/models/sam/modules/utils.py +174 -50
- ultralytics/models/sam/predict.py +2268 -366
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +529 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +199 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +339 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/vitdet.py +547 -0
- ultralytics/models/sam/sam3/vl_combiner.py +160 -0
- ultralytics/models/utils/loss.py +14 -26
- ultralytics/models/utils/ops.py +13 -17
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +9 -12
- ultralytics/models/yolo/classify/train.py +15 -41
- ultralytics/models/yolo/classify/val.py +34 -32
- ultralytics/models/yolo/detect/predict.py +8 -11
- ultralytics/models/yolo/detect/train.py +13 -32
- ultralytics/models/yolo/detect/val.py +75 -63
- ultralytics/models/yolo/model.py +37 -53
- ultralytics/models/yolo/obb/predict.py +5 -14
- ultralytics/models/yolo/obb/train.py +11 -14
- ultralytics/models/yolo/obb/val.py +42 -39
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +7 -22
- ultralytics/models/yolo/pose/train.py +10 -22
- ultralytics/models/yolo/pose/val.py +40 -59
- ultralytics/models/yolo/segment/predict.py +16 -20
- ultralytics/models/yolo/segment/train.py +3 -12
- ultralytics/models/yolo/segment/val.py +106 -56
- ultralytics/models/yolo/world/train.py +12 -16
- ultralytics/models/yolo/world/train_world.py +11 -34
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +16 -23
- ultralytics/models/yolo/yoloe/train.py +31 -56
- ultralytics/models/yolo/yoloe/train_seg.py +5 -10
- ultralytics/models/yolo/yoloe/val.py +16 -21
- ultralytics/nn/__init__.py +7 -7
- ultralytics/nn/autobackend.py +152 -80
- ultralytics/nn/modules/__init__.py +60 -60
- ultralytics/nn/modules/activation.py +4 -6
- ultralytics/nn/modules/block.py +133 -217
- ultralytics/nn/modules/conv.py +52 -97
- ultralytics/nn/modules/head.py +64 -116
- ultralytics/nn/modules/transformer.py +79 -89
- ultralytics/nn/modules/utils.py +16 -21
- ultralytics/nn/tasks.py +111 -156
- ultralytics/nn/text_model.py +40 -67
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +11 -17
- ultralytics/solutions/analytics.py +15 -16
- ultralytics/solutions/config.py +5 -6
- ultralytics/solutions/distance_calculation.py +10 -13
- ultralytics/solutions/heatmap.py +7 -13
- ultralytics/solutions/instance_segmentation.py +5 -8
- ultralytics/solutions/object_blurrer.py +7 -10
- ultralytics/solutions/object_counter.py +12 -19
- ultralytics/solutions/object_cropper.py +8 -14
- ultralytics/solutions/parking_management.py +33 -31
- ultralytics/solutions/queue_management.py +10 -12
- ultralytics/solutions/region_counter.py +9 -12
- ultralytics/solutions/security_alarm.py +15 -20
- ultralytics/solutions/similarity_search.py +13 -17
- ultralytics/solutions/solutions.py +75 -74
- ultralytics/solutions/speed_estimation.py +7 -10
- ultralytics/solutions/streamlit_inference.py +4 -7
- ultralytics/solutions/templates/similarity-search.html +7 -18
- ultralytics/solutions/trackzone.py +7 -10
- ultralytics/solutions/vision_eye.py +5 -8
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +3 -5
- ultralytics/trackers/bot_sort.py +10 -27
- ultralytics/trackers/byte_tracker.py +14 -30
- ultralytics/trackers/track.py +3 -6
- ultralytics/trackers/utils/gmc.py +11 -22
- ultralytics/trackers/utils/kalman_filter.py +37 -48
- ultralytics/trackers/utils/matching.py +12 -15
- ultralytics/utils/__init__.py +116 -116
- ultralytics/utils/autobatch.py +2 -4
- ultralytics/utils/autodevice.py +17 -18
- ultralytics/utils/benchmarks.py +70 -70
- ultralytics/utils/callbacks/base.py +8 -10
- ultralytics/utils/callbacks/clearml.py +5 -13
- ultralytics/utils/callbacks/comet.py +32 -46
- ultralytics/utils/callbacks/dvc.py +13 -18
- ultralytics/utils/callbacks/mlflow.py +4 -5
- ultralytics/utils/callbacks/neptune.py +7 -15
- ultralytics/utils/callbacks/platform.py +314 -38
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +23 -31
- ultralytics/utils/callbacks/wb.py +10 -13
- ultralytics/utils/checks.py +151 -87
- ultralytics/utils/cpu.py +3 -8
- ultralytics/utils/dist.py +19 -15
- ultralytics/utils/downloads.py +29 -41
- ultralytics/utils/errors.py +6 -14
- ultralytics/utils/events.py +2 -4
- ultralytics/utils/export/__init__.py +7 -0
- ultralytics/utils/{export.py → export/engine.py} +16 -16
- ultralytics/utils/export/imx.py +325 -0
- ultralytics/utils/export/tensorflow.py +231 -0
- ultralytics/utils/files.py +24 -28
- ultralytics/utils/git.py +9 -11
- ultralytics/utils/instance.py +30 -51
- ultralytics/utils/logger.py +212 -114
- ultralytics/utils/loss.py +15 -24
- ultralytics/utils/metrics.py +131 -160
- ultralytics/utils/nms.py +21 -30
- ultralytics/utils/ops.py +107 -165
- ultralytics/utils/patches.py +33 -21
- ultralytics/utils/plotting.py +122 -119
- ultralytics/utils/tal.py +28 -44
- ultralytics/utils/torch_utils.py +70 -187
- ultralytics/utils/tqdm.py +20 -20
- ultralytics/utils/triton.py +13 -19
- ultralytics/utils/tuner.py +17 -5
- dgenerate_ultralytics_headless-8.3.196.dist-info/RECORD +0 -281
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.196.dist-info → dgenerate_ultralytics_headless-8.3.248.dist-info}/top_level.txt +0 -0
--- a/ultralytics/models/sam/modules/sam.py
+++ b/ultralytics/models/sam/modules/sam.py
@@ -13,7 +13,7 @@ from torch.nn.init import trunc_normal_
 from ultralytics.nn.modules import MLP
 from ultralytics.utils import LOGGER
 
-from .blocks import SAM2TwoWayTransformer
+from .blocks import SAM2TwoWayTransformer, TwoWayTransformer
 from .decoders import MaskDecoder, SAM2MaskDecoder
 from .encoders import ImageEncoderViT, PromptEncoder
 from .utils import get_1d_sine_pe, select_closest_cond_frames
@@ -23,11 +23,10 @@ NO_OBJ_SCORE = -1024.0
 
 
 class SAMModel(nn.Module):
-    """
-    Segment Anything Model (SAM) for object segmentation tasks.
+    """Segment Anything Model (SAM) for object segmentation tasks.
 
-    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
-    and input prompts.
+    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images and input
+    prompts.
 
     Attributes:
         mask_threshold (float): Threshold value for mask prediction.
@@ -61,8 +60,7 @@ class SAMModel(nn.Module):
         pixel_mean: list[float] = (123.675, 116.28, 103.53),
         pixel_std: list[float] = (58.395, 57.12, 57.375),
     ) -> None:
-        """
-        Initialize the SAMModel class to predict object masks from an image and input prompts.
+        """Initialize the SAMModel class to predict object masks from an image and input prompts.
 
         Args:
             image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
@@ -71,13 +69,6 @@ class SAMModel(nn.Module):
             pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
             pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.
 
-        Examples:
-            >>> image_encoder = ImageEncoderViT(...)
-            >>> prompt_encoder = PromptEncoder(...)
-            >>> mask_decoder = MaskDecoder(...)
-            >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
-            >>> # Further usage depends on SAMPredictor class
-
         Notes:
             All forward() operations moved to SAMPredictor.
         """
@@ -98,11 +89,10 @@ class SAMModel(nn.Module):
 
 
 class SAM2Model(torch.nn.Module):
-    """
-    SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.
+    """SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.
 
-    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
-    for temporal consistency and efficient tracking of objects across frames.
+    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms for temporal
+    consistency and efficient tracking of objects across frames.
 
     Attributes:
         mask_threshold (float): Threshold value for mask prediction.
@@ -136,24 +126,24 @@ class SAM2Model(torch.nn.Module):
         use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
         no_obj_embed_spatial (torch.Tensor | None): No-object embedding for spatial frames.
         max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
-        directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
-            first frame.
-        multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
-            conditioning frames.
+        directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the first
+            frame.
+        multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial conditioning
+            frames.
         multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
         multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
         multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
         use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
         iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
         memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
-        non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
-            memory encoder during evaluation.
+        non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in memory
+            encoder during evaluation.
         sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
         sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
-        binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
-            with clicks during evaluation.
-        use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
-            prompt encoder and mask decoder on frames with mask input.
+        binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames with
+            clicks during evaluation.
+        use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM prompt
+            encoder and mask decoder on frames with mask input.
 
     Methods:
         forward_image: Process image batch through encoder to extract multi-level features.
@@ -208,8 +198,7 @@ class SAM2Model(torch.nn.Module):
         sam_mask_decoder_extra_args=None,
         compile_image_encoder: bool = False,
     ):
-        """
-        Initialize the SAM2Model for video object segmentation with memory-based tracking.
+        """Initialize the SAM2Model for video object segmentation with memory-based tracking.
 
         Args:
             image_encoder (nn.Module): Visual encoder for extracting image features.
@@ -220,35 +209,35 @@ class SAM2Model(torch.nn.Module):
             backbone_stride (int): Stride of the image backbone output.
             sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
             sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
-            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
-                with clicks during evaluation.
+            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames with
+                clicks during evaluation.
             use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                 prompt encoder and mask decoder on frames with mask input.
             max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
-            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
-                first frame.
+            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the first
+                frame.
             use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
-            multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
-                conditioning frames.
+            multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial conditioning
+                frames.
             multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
             multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
             multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
             iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
             memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
-            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
-                memory encoder during evaluation.
+            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in memory
+                encoder during evaluation.
             use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
             max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                 cross-attention.
-            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
-                the encoder.
+            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in the
+                encoder.
             proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                 encoding in object pointers.
             use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in the temporal positional encoding
                 in the object pointers.
-            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
-                during evaluation.
+            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
+                evaluation.
             pred_obj_scores (bool): Whether to predict if there is an object in the frame.
             pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
             fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
@@ -257,15 +246,6 @@ class SAM2Model(torch.nn.Module):
             no_obj_embed_spatial (bool): Whether add no obj embedding to spatial frames.
             sam_mask_decoder_extra_args (dict | None): Extra arguments for constructing the SAM mask decoder.
             compile_image_encoder (bool): Whether to compile the image encoder for faster inference.
-
-        Examples:
-            >>> image_encoder = ImageEncoderViT(...)
-            >>> memory_attention = SAM2TwoWayTransformer(...)
-            >>> memory_encoder = nn.Sequential(...)
-            >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
-            >>> image_batch = torch.rand(1, 3, 512, 512)
-            >>> features = model.forward_image(image_batch)
-            >>> track_results = model.track_step(0, True, features, None, None, None, {})
         """
         super().__init__()
 
@@ -349,6 +329,7 @@ class SAM2Model(torch.nn.Module):
 
         self._build_sam_heads()
         self.max_cond_frames_in_attn = max_cond_frames_in_attn
+        self.add_all_frames_to_correct_as_cond = True
 
         # Model compilation
         if compile_image_encoder:
@@ -428,25 +409,23 @@ class SAM2Model(torch.nn.Module):
         high_res_features=None,
         multimask_output=False,
     ):
-        """
-        Forward pass through SAM prompt encoders and mask heads.
+        """Forward pass through SAM prompt encoders and mask heads.
 
         This method processes image features and optional point/mask inputs to generate object masks and scores.
 
         Args:
             backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
             point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts.
-                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing the absolute
-                    pixel-unit coordinates in (x, y) format for P input points.
-                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
-                    0 means negative clicks, and -1 means padding.
-            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
-                same spatial size as the image.
-            high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes
-                (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
-                for SAM decoder.
-            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
-                output only 1 mask and its IoU estimate.
+                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute pixel-unit coordinates in
+                    (x, y) format for P input points.
+                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks, 0 means negative
+                    clicks, and -1 means padding.
+            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the same spatial
+                size as the image.
+            high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes (B, C, 4*H, 4*W) and (B,
+                C, 2*H, 2*W) respectively, used as high-resolution feature maps for SAM decoder.
+            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False, output only 1
+                mask and its IoU estimate.
 
         Returns:
             low_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
@@ -472,7 +451,7 @@ class SAM2Model(torch.nn.Module):
             ...     object_score_logits,
             ... ) = results
         """
-        B = backbone_features.size(0)
+        B = backbone_features.shape[0]
         device = backbone_features.device
         assert backbone_features.size(1) == self.sam_prompt_embed_dim
         assert backbone_features.size(2) == self.sam_image_embedding_size
@@ -482,7 +461,7 @@ class SAM2Model(torch.nn.Module):
         if point_inputs is not None:
             sam_point_coords = point_inputs["point_coords"]
             sam_point_labels = point_inputs["point_labels"]
-            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+            assert sam_point_coords.shape[0] == B and sam_point_labels.shape[0] == B
         else:
             # If no points are provide, pad with an empty point (with label -1)
             sam_point_coords = torch.zeros(B, 1, 2, device=device, dtype=backbone_features.dtype)
@@ -495,7 +474,7 @@ class SAM2Model(torch.nn.Module):
             assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
             if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
                 sam_mask_prompt = F.interpolate(
-                    mask_inputs.float(),
+                    mask_inputs.to(backbone_features.dtype),
                     size=self.sam_prompt_encoder.mask_input_size,
                     align_corners=False,
                     mode="bilinear",
@@ -585,15 +564,15 @@ class SAM2Model(torch.nn.Module):
             antialias=True,  # use antialias for downsampling
         )
         # a dummy IoU prediction of all 1's under mask input
-        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
+        ious = mask_inputs.new_ones(mask_inputs.shape[0], 1).float()
         if not self.use_obj_ptrs_in_encoder or backbone_features is None or high_res_features is None:
             # all zeros as a dummy object pointer (of shape [B, C])
-            obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
+            obj_ptr = torch.zeros(mask_inputs.shape[0], self.hidden_dim, device=mask_inputs.device)
         else:
             # produce an object pointer using the SAM decoder from the mask input
             _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
                 backbone_features=backbone_features,
-                mask_inputs=self.mask_downsample(mask_inputs_float),
+                mask_inputs=self.mask_downsample(mask_inputs_float.to(backbone_features.dtype)),
                 high_res_features=high_res_features,
             )
         # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
@@ -628,8 +607,14 @@ class SAM2Model(torch.nn.Module):
             backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
         return backbone_out
 
-    def _prepare_backbone_features(self, backbone_out):
+    def _prepare_backbone_features(self, backbone_out, batch=1):
         """Prepare and flatten visual features from the image backbone output for further processing."""
+        if batch > 1:  # expand features if there's more than one prompt
+            backbone_out = {
+                **backbone_out,
+                "backbone_fpn": [feat.expand(batch, -1, -1, -1) for feat in backbone_out["backbone_fpn"]],
+                "vision_pos_enc": [pos.expand(batch, -1, -1, -1) for pos in backbone_out["vision_pos_enc"]],
+            }
         assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
         assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
 
@@ -640,7 +625,6 @@ class SAM2Model(torch.nn.Module):
         # flatten NxCxHxW to HWxNxC
         vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
         vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
-
         return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
 
     def _prepare_memory_conditioned_features(
@@ -712,7 +696,7 @@ class SAM2Model(torch.nn.Module):
                     continue  # skip padding frames
                 # "maskmem_features" might have been offloaded to CPU in demo use cases,
                 # so we load it back to inference device (it's a no-op if it's already on device).
-                feats = prev["maskmem_features"].to(device=device, non_blocking=True)
+                feats = prev["maskmem_features"].to(device=device, non_blocking=device.type == "cuda")
                 to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
                 # Spatial positional encoding (it might have been offloaded to CPU in eval)
                 maskmem_enc = prev["maskmem_pos_enc"][-1].to(device=device)
@@ -803,7 +787,7 @@ class SAM2Model(torch.nn.Module):
             memory_pos=memory_pos_embed,
             num_obj_ptr_tokens=num_obj_ptr_tokens,
         )
-        # reshape the output (HW)BC => BCHW
+        # Reshape output (HW)BC => BCHW
         pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
         return pix_feat_with_mem
 
@@ -840,7 +824,6 @@ class SAM2Model(torch.nn.Module):
             mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
         maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)  # sigmoid already applied
         maskmem_features = maskmem_out["vision_features"]
-        maskmem_pos_enc = maskmem_out["vision_pos_enc"]
         # add a no-object embedding to the spatial memory to indicate that the frame
         # is predicted to be occluded (i.e. no object is appearing in the frame)
         if self.no_obj_embed_spatial is not None:
@@ -849,7 +832,7 @@ class SAM2Model(torch.nn.Module):
                 ..., None, None
             ].expand(*maskmem_features.shape)
 
-        return maskmem_features, maskmem_pos_enc
+        return maskmem_features, maskmem_out["vision_pos_enc"]
 
     def _track_step(
         self,
@@ -881,7 +864,7 @@ class SAM2Model(torch.nn.Module):
             pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
             sam_outputs = self._use_mask_as_output(mask_inputs, pix_feat, high_res_features)
         else:
-            # fused the visual feature with previous memory features in the memory bank
+            # Fuse visual features with previous memory features in the memory bank
            pix_feat = self._prepare_memory_conditioned_features(
                 frame_idx=frame_idx,
                 is_init_cond_frame=is_init_cond_frame,
@@ -1006,7 +989,7 @@ class SAM2Model(torch.nn.Module):
     @staticmethod
     def _apply_non_overlapping_constraints(pred_masks):
        """Apply non-overlapping constraints to masks, keeping the highest scoring object per location."""
-        batch_size = pred_masks.size(0)
+        batch_size = pred_masks.shape[0]
         if batch_size == 1:
             return pred_masks
 
@@ -1027,7 +1010,151 @@ class SAM2Model(torch.nn.Module):
 
     def set_imgsz(self, imgsz):
         """Set image size to make model compatible with different image sizes."""
+        if hasattr(self.image_encoder, "set_imgsz"):
+            self.image_encoder.set_imgsz(imgsz)
         self.image_size = imgsz[0]
         self.sam_prompt_encoder.input_image_size = imgsz
-        self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
+        self.sam_prompt_encoder.image_embedding_size = [
+            x // self.backbone_stride for x in imgsz
+        ]  # fixed ViT patch size of 16
+        self.sam_prompt_encoder.mask_input_size = [
+            x // self.backbone_stride * 4 for x in imgsz
+        ]  # fixed ViT patch size of 16
         self.sam_image_embedding_size = self.image_size // self.backbone_stride  # update image embedding size
+
+
+class SAM3Model(SAM2Model):
+    """SAM3Model class for Segment Anything Model 3 with memory-based video object segmentation capabilities."""
+
+    def __init__(
+        self,
+        image_encoder,
+        memory_attention,
+        memory_encoder,
+        num_maskmem=7,
+        image_size=1008,
+        backbone_stride=14,
+        sigmoid_scale_for_mem_enc=1,
+        sigmoid_bias_for_mem_enc=0,
+        binarize_mask_from_pts_for_mem_enc=False,
+        use_mask_input_as_output_without_sam=False,
+        max_cond_frames_in_attn=-1,
+        directly_add_no_mem_embed=False,
+        use_high_res_features_in_sam=False,
+        multimask_output_in_sam=False,
+        multimask_min_pt_num=1,
+        multimask_max_pt_num=1,
+        multimask_output_for_tracking=False,
+        use_multimask_token_for_obj_ptr: bool = False,
+        iou_prediction_use_sigmoid=False,
+        memory_temporal_stride_for_eval=1,
+        non_overlap_masks_for_mem_enc=False,
+        use_obj_ptrs_in_encoder=False,
+        max_obj_ptrs_in_encoder=16,
+        add_tpos_enc_to_obj_ptrs=True,
+        proj_tpos_enc_in_obj_ptrs=False,
+        use_signed_tpos_enc_to_obj_ptrs=False,
+        only_obj_ptrs_in_the_past_for_eval=False,
+        pred_obj_scores: bool = False,
+        pred_obj_scores_mlp: bool = False,
+        fixed_no_obj_ptr: bool = False,
+        soft_no_obj_ptr: bool = False,
+        use_mlp_for_obj_ptr_proj: bool = False,
+        no_obj_embed_spatial: bool = False,
+        sam_mask_decoder_extra_args=None,
+        compile_image_encoder: bool = False,
+    ):
+        """SAM3Model class for Segment Anything Model 3 with memory-based video object segmentation capabilities."""
+        super().__init__(
+            image_encoder,
+            memory_attention,
+            memory_encoder,
+            num_maskmem,
+            image_size,
+            backbone_stride,
+            sigmoid_scale_for_mem_enc,
+            sigmoid_bias_for_mem_enc,
+            binarize_mask_from_pts_for_mem_enc,
+            use_mask_input_as_output_without_sam,
+            max_cond_frames_in_attn,
+            directly_add_no_mem_embed,
+            use_high_res_features_in_sam,
+            multimask_output_in_sam,
+            multimask_min_pt_num,
+            multimask_max_pt_num,
+            multimask_output_for_tracking,
+            use_multimask_token_for_obj_ptr,
+            iou_prediction_use_sigmoid,
+            memory_temporal_stride_for_eval,
+            non_overlap_masks_for_mem_enc,
+            use_obj_ptrs_in_encoder,
+            max_obj_ptrs_in_encoder,
+            add_tpos_enc_to_obj_ptrs,
+            proj_tpos_enc_in_obj_ptrs,
+            use_signed_tpos_enc_to_obj_ptrs,
+            only_obj_ptrs_in_the_past_for_eval,
+            pred_obj_scores,
+            pred_obj_scores_mlp,
+            fixed_no_obj_ptr,
+            soft_no_obj_ptr,
+            use_mlp_for_obj_ptr_proj,
+            no_obj_embed_spatial,
+            sam_mask_decoder_extra_args,
+            compile_image_encoder,
+        )
+        self.sam_mask_decoder = SAM2MaskDecoder(
+            num_multimask_outputs=3,
+            transformer=TwoWayTransformer(
+                depth=2,
+                embedding_dim=self.sam_prompt_embed_dim,
+                mlp_dim=2048,
+                num_heads=8,
+            ),
+            transformer_dim=self.sam_prompt_embed_dim,
+            iou_head_depth=3,
+            iou_head_hidden_dim=256,
+            use_high_res_features=self.use_high_res_features_in_sam,
+            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
+            pred_obj_scores=self.pred_obj_scores,
+            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
+            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
+            **(self.sam_mask_decoder_extra_args or {}),
+        )
+
+    def forward_image(self, img_batch: torch.Tensor):
+        """Process image batch through encoder to extract multi-level features for SAM model."""
+        backbone_out = self.image_encoder.forward_image_sam2(img_batch)
+        if self.use_high_res_features_in_sam:
+            # precompute projected level 0 and level 1 features in SAM decoder
+            # to avoid running it again on every SAM click
+            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
+            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
+        return backbone_out
+
+    def set_imgsz(self, imgsz: tuple[int, int]):
+        """Set the image size for the model and mask downsampler."""
+        super().set_imgsz(imgsz)
+        self.memory_encoder.mask_downsampler.interpol_size = [size // 14 * 16 for size in imgsz]
+
+    @staticmethod
+    def _suppress_shrinked_masks(pred_masks, new_pred_masks, shrink_threshold=0.3):
+        """Suppress masks that shrink in area after applying pixelwise non-overlapping constraints."""
+        area_before = (pred_masks > 0).sum(dim=(-1, -2))
+        area_after = (new_pred_masks > 0).sum(dim=(-1, -2))
+        area_before = torch.clamp(area_before, min=1.0)
+        area_ratio = area_after / area_before
+        keep = area_ratio >= shrink_threshold
+        keep_mask = keep[..., None, None].expand_as(pred_masks)
+        pred_masks_after = torch.where(keep_mask, pred_masks, torch.clamp(pred_masks, max=-10.0))
+        return pred_masks_after
+
+    def _suppress_object_pw_area_shrinkage(self, pred_masks):
+        """This function suppresses masks that shrink in area after applying pixelwise non-overlapping constraints. Note
+        that the final output can still be overlapping.
+        """
+        # Apply pixel-wise non-overlapping constraint based on mask scores
+        pixel_level_non_overlapping_masks = self._apply_non_overlapping_constraints(pred_masks)
+        # Fully suppress masks with high shrinkage (probably noisy) based on the pixel wise non-overlapping constraints
+        # NOTE: The output of this function can be a no op if none of the masks shrink by a large factor.
+        pred_masks = self._suppress_shrinked_masks(pred_masks, pixel_level_non_overlapping_masks)
+        return pred_masks
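The most visible addition in this release is SAM 3 support: the new `ultralytics/models/sam/sam3/` package, `build_sam3.py`, and the `SAM3Model` class shown in the diff above. Below is a minimal usage sketch, assuming the new model loads through the existing `SAM` wrapper; the `sam3.pt` checkpoint name is an assumption and is not confirmed by this diff.

```python
# Hypothetical sketch of exercising the new SAM 3 support in 8.3.248.
# The "sam3.pt" weight name is an assumption; the diff only shows that
# SAM3Model and build_sam3.py were added, not how checkpoints are named.
from ultralytics import SAM

model = SAM("sam3.pt")        # load a SAM-family checkpoint through the high-level SAM wrapper
results = model("image.jpg")  # run segmentation on a single image
for r in results:
    print(r.masks)            # predicted masks for the image (None if nothing was segmented)
```

The same wrapper already handles SAM and SAM 2 checkpoints (for example `sam2.1_b.pt`), so the sketch can also be run against those weights on either version of the package.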