dgenerate-ultralytics-headless 8.3.237__py3-none-any.whl → 8.3.240__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/METADATA +2 -1
  2. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/RECORD +105 -106
  3. tests/test_exports.py +3 -1
  4. tests/test_python.py +2 -2
  5. tests/test_solutions.py +6 -6
  6. ultralytics/__init__.py +1 -1
  7. ultralytics/cfg/__init__.py +4 -4
  8. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  9. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  10. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  11. ultralytics/cfg/datasets/VOC.yaml +15 -16
  12. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  13. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  14. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  15. ultralytics/cfg/datasets/dota8.yaml +2 -2
  16. ultralytics/cfg/datasets/kitti.yaml +1 -1
  17. ultralytics/cfg/datasets/xView.yaml +16 -16
  18. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  19. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  20. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  21. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  22. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  23. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  24. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  25. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  26. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  27. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  28. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  29. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  30. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  31. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  32. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  33. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  34. ultralytics/data/augment.py +1 -1
  35. ultralytics/data/base.py +4 -2
  36. ultralytics/data/build.py +4 -4
  37. ultralytics/data/loaders.py +17 -12
  38. ultralytics/data/utils.py +4 -4
  39. ultralytics/engine/exporter.py +24 -16
  40. ultralytics/engine/predictor.py +5 -4
  41. ultralytics/engine/results.py +12 -13
  42. ultralytics/engine/trainer.py +2 -2
  43. ultralytics/engine/tuner.py +2 -3
  44. ultralytics/engine/validator.py +2 -2
  45. ultralytics/models/fastsam/model.py +2 -2
  46. ultralytics/models/fastsam/predict.py +2 -3
  47. ultralytics/models/fastsam/val.py +4 -4
  48. ultralytics/models/rtdetr/predict.py +2 -3
  49. ultralytics/models/rtdetr/val.py +5 -4
  50. ultralytics/models/sam/build.py +5 -5
  51. ultralytics/models/sam/build_sam3.py +9 -6
  52. ultralytics/models/sam/model.py +1 -1
  53. ultralytics/models/sam/modules/sam.py +10 -5
  54. ultralytics/models/sam/modules/utils.py +8 -3
  55. ultralytics/models/sam/predict.py +53 -62
  56. ultralytics/models/sam/sam3/encoder.py +4 -4
  57. ultralytics/models/sam/sam3/geometry_encoders.py +3 -3
  58. ultralytics/models/sam/sam3/necks.py +17 -17
  59. ultralytics/models/sam/sam3/sam3_image.py +3 -21
  60. ultralytics/models/sam/sam3/vl_combiner.py +1 -6
  61. ultralytics/models/yolo/classify/val.py +1 -1
  62. ultralytics/models/yolo/detect/train.py +1 -1
  63. ultralytics/models/yolo/detect/val.py +7 -7
  64. ultralytics/models/yolo/obb/val.py +1 -1
  65. ultralytics/models/yolo/pose/val.py +1 -1
  66. ultralytics/models/yolo/segment/val.py +1 -1
  67. ultralytics/nn/autobackend.py +9 -9
  68. ultralytics/nn/modules/block.py +1 -1
  69. ultralytics/nn/tasks.py +3 -3
  70. ultralytics/nn/text_model.py +2 -7
  71. ultralytics/solutions/ai_gym.py +1 -1
  72. ultralytics/solutions/analytics.py +6 -6
  73. ultralytics/solutions/config.py +1 -1
  74. ultralytics/solutions/distance_calculation.py +1 -1
  75. ultralytics/solutions/object_counter.py +1 -1
  76. ultralytics/solutions/object_cropper.py +3 -6
  77. ultralytics/solutions/parking_management.py +21 -17
  78. ultralytics/solutions/queue_management.py +5 -5
  79. ultralytics/solutions/region_counter.py +2 -2
  80. ultralytics/solutions/security_alarm.py +1 -1
  81. ultralytics/solutions/solutions.py +45 -22
  82. ultralytics/solutions/speed_estimation.py +1 -1
  83. ultralytics/trackers/basetrack.py +1 -1
  84. ultralytics/trackers/bot_sort.py +4 -3
  85. ultralytics/trackers/byte_tracker.py +4 -4
  86. ultralytics/trackers/utils/gmc.py +6 -7
  87. ultralytics/trackers/utils/kalman_filter.py +2 -1
  88. ultralytics/trackers/utils/matching.py +4 -3
  89. ultralytics/utils/__init__.py +12 -3
  90. ultralytics/utils/benchmarks.py +2 -2
  91. ultralytics/utils/callbacks/tensorboard.py +19 -25
  92. ultralytics/utils/checks.py +2 -1
  93. ultralytics/utils/downloads.py +1 -1
  94. ultralytics/utils/export/tensorflow.py +16 -2
  95. ultralytics/utils/files.py +13 -12
  96. ultralytics/utils/logger.py +62 -27
  97. ultralytics/utils/metrics.py +1 -1
  98. ultralytics/utils/ops.py +6 -6
  99. ultralytics/utils/patches.py +3 -3
  100. ultralytics/utils/plotting.py +18 -23
  101. ultralytics/utils/tuner.py +1 -1
  102. ultralytics/models/sam/sam3/tokenizer_ve.py +0 -242
  103. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/WHEEL +0 -0
  104. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/entry_points.txt +0 -0
  105. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/licenses/LICENSE +0 -0
  106. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/top_level.txt +0 -0
@@ -110,10 +110,12 @@ class Predictor(BasePredictor):
 """Preprocess the input image for model inference.

 This method prepares the input image by applying transformations and normalization. It supports both
- torch.Tensor and list of np.ndarray as input formats.
+ torch.Tensor and list of np.ndarray as input formats. For OpenCV-loaded images, the input is typically BGR and
+ is converted to RGB during preprocessing.

 Args:
- im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or list of HWC numpy arrays.
+ im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or a list of HWC NumPy arrays.
+ NumPy arrays are expected to be in BGR order (as returned by OpenCV) and will be converted to RGB.

 Returns:
 (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.
@@ -534,7 +536,7 @@ class Predictor(BasePredictor):

 Args:
 image (str | np.ndarray): Path to the image file as a string, or a numpy array representing an image read by
- cv2.
+ cv2 (BGR channel order).

 Raises:
 AssertionError: If more than one image is attempted to be set.
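Note: the BGR clarification above only matters when feeding raw OpenCV frames to the predictor. A minimal usage sketch of what the updated docstring describes, assuming the high-level `ultralytics.SAM` wrapper and a locally available `sam2.1_b.pt` checkpoint (both names are illustrative, not taken from this diff):

```python
import cv2
from ultralytics import SAM

# cv2.imread returns an HWC uint8 array in BGR order; per the docstring above,
# the predictor's preprocess step converts BGR to RGB internally.
frame = cv2.imread("bus.jpg")  # hypothetical input image

model = SAM("sam2.1_b.pt")  # assumed checkpoint name
results = model(frame, points=[[450, 300]], labels=[1])  # single positive point prompt
```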
@@ -876,6 +878,7 @@ class SAM2VideoPredictor(SAM2Predictor):
 self.clear_non_cond_mem_around_input = False
 self.clear_non_cond_mem_for_multi_obj = False
 self.callbacks["on_predict_start"].append(self.init_state)
+ self.clear_non_cond_mem = True  # Whether to clear non-conditioning memory periodically

 def get_model(self):
 """Retrieve and configure the model with binarization enabled.
@@ -950,6 +953,7 @@ class SAM2VideoPredictor(SAM2Predictor):
 run_mem_encoder=True,
 )
 output_dict[storage_key][frame] = current_out
+ self._prune_non_cond_memory(frame)
 # Create slices of per-object outputs for subsequent interaction with each
 # individual object after tracking.
 self._add_output_per_object(frame, current_out, storage_key)
@@ -1244,14 +1248,11 @@ class SAM2VideoPredictor(SAM2Predictor):
 - If `batch` is greater than 1, the features are expanded to fit the batch size.
 - The method leverages the model's `_prepare_backbone_features` method to prepare the backbone features.
 """
- backbone_out = self.model.forward_image(im)
- if batch > 1:  # expand features if there's more than one prompt
- for i, feat in enumerate(backbone_out["backbone_fpn"]):
- backbone_out["backbone_fpn"][i] = feat.expand(batch, -1, -1, -1)
- for i, pos in enumerate(backbone_out["vision_pos_enc"]):
- pos = pos.expand(batch, -1, -1, -1)
- backbone_out["vision_pos_enc"][i] = pos
- _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
+ # check if there's precomputed backbone output
+ backbone_out = getattr(self, "backbone_out", None)
+ if backbone_out is None:
+ backbone_out = self.model.forward_image(im)
+ _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out, batch=batch)
 return vis_feats, vis_pos_embed, feat_sizes

 def _obj_id_to_idx(self, obj_id, inference_state: dict[str, Any] | None = None):
@@ -1831,6 +1832,25 @@ class SAM2VideoPredictor(SAM2Predictor):
 inference_state["frames_already_tracked"].clear()
 inference_state["first_ann_frame_idx"] = None

+ def _prune_non_cond_memory(self, frame_idx, inference_state=None):
+ """Prune old non-conditioning frames to bound memory usage."""
+ if not self.clear_non_cond_mem:
+ return
+ inference_state = inference_state or self.inference_state
+
+ # Determine window size
+ min_frame = frame_idx - self.model.num_maskmem * self.model.memory_temporal_stride_for_eval
+ output_dict = inference_state["output_dict"]
+
+ # Prune global non_cond_frame_outputs
+ for f in [k for k in output_dict["non_cond_frame_outputs"] if k < min_frame]:
+ output_dict["non_cond_frame_outputs"].pop(f, None)
+
+ # Prune per-object non_cond_frame_outputs
+ for obj_output_dict in inference_state.get("output_dict_per_obj", {}).values():
+ for f in [k for k in obj_output_dict["non_cond_frame_outputs"] if k < min_frame]:
+ obj_output_dict["non_cond_frame_outputs"].pop(f, None)
+

 class SAM2DynamicInteractivePredictor(SAM2Predictor):
 """SAM2DynamicInteractivePredictor extends SAM2Predictor to support dynamic interactions with video frames or a
@@ -2055,11 +2075,12 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 self.memory_bank.append(consolidated_out)

 def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
- """Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
- prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features
- for all objects in the image. If there is no memory, it will directly add a no-memory embedding to the
- current vision features. If there is memory, it will use the memory features from previous frames to
- condition the current vision features using a transformer attention mechanism.
+ """Prepare memory-conditioned features for the current image state.
+
+ If ``obj_idx`` is provided, features are prepared for a specific prompted object in the image. If ``obj_idx`` is
+ None, features are prepared for all objects. If no memory is available, a no-memory embedding is added to the
+ current vision features. Otherwise, memory from previous frames is used to condition the current vision features
+ via a transformer attention mechanism.

 Args:
 obj_idx (int | None): The index of the object for which to prepare the features.
@@ -2068,8 +2089,8 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 pix_feat_with_mem (torch.Tensor): The memory-conditioned pixel features.
 """
 if len(self.memory_bank) == 0 or isinstance(obj_idx, int):
- # for initial conditioning frames with, encode them without using any previous memory
- # directly add no-mem embedding (instead of using the transformer encoder)
+ # For initial conditioning frames, encode without using any previous memory.
+ # Directly add the no-memory embedding (instead of using the transformer encoder).
 pix_feat_with_mem = self.vision_feats[-1] + self.model.no_mem_embed
 else:
 # for inference frames, use the memory features from previous frames
@@ -2081,7 +2102,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 memory_pos=memory_pos_embed,
 num_obj_ptr_tokens=0, # num_obj_ptr_tokens
 )
- # reshape the output (HW)BC => BCHW
+ # Reshape output (HW)BC => BCHW
 return pix_feat_with_mem.permute(1, 2, 0).view(
 self._max_obj_num,
 self.model.memory_attention.d_model,
@@ -2145,9 +2166,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 pix_feat = pix_feat.view(-1, self.model.memory_attention.d_model, *self.feat_sizes[-1])
 _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._use_mask_as_output(mask)
 else:
- # fused the visual feature with previous memory features in the memory bank
+ # Fuse visual features with previous memory features in the memory bank.
 pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
- # calculate the first feature if adding obj_idx exists(means adding prompts)
+ # If ``obj_idx`` is provided (i.e., prompts are being added), keep only the first feature map.
 pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
 _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
 backbone_features=pix_feat_with_mem,
@@ -2182,7 +2203,7 @@ class SAM3Predictor(SAM2Predictor):
 self.std = torch.tensor([127.5, 127.5, 127.5]).view(-1, 1, 1).to(self.device)

 def get_model(self):
- """Retrieve and initialize the Segment Anything Model 2 (SAM2) for image segmentation tasks."""
+ """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
 from .build_sam3 import build_interactive_sam3 # slow import

 return build_interactive_sam3(self.args.model, compile=self.args.compile)
@@ -2191,16 +2212,11 @@
 class SAM3SemanticPredictor(SAM3Predictor):
 """Segment Anything Model 3 (SAM3) Predictor for image segmentation tasks."""

- def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None, bpe_path=None):
- """Initialize the SAM3SemanticPredictor with configuration and optional overrides."""
- super().__init__(cfg, overrides, _callbacks)
- self.bpe_path = bpe_path
-
 def get_model(self):
 """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
 from .build_sam3 import build_sam3_image_model # slow import

- return build_sam3_image_model(self.args.model, bpe_path=self.bpe_path, compile=self.args.compile)
+ return build_sam3_image_model(self.args.model, compile=self.args.compile)

 @smart_inference_mode()
 def get_im_features(self, im):
@@ -2428,6 +2444,7 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):
 inference_state=inference_state,
 )
 output_dict[storage_key][frame] = current_out
+ self._prune_non_cond_memory(frame, inference_state=inference_state)
 # Create slices of per-object outputs for subsequent interaction with each
 # individual object after tracking.
 self._add_output_per_object(frame, current_out, storage_key, inference_state=inference_state)
@@ -2437,24 +2454,6 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):

 return obj_ids, pred_masks, obj_scores

- def get_im_features(self, im, batch=1):
- """A wrapper to get image features, supporting pre-extracted backbone outputs."""
- if getattr(self, "backbone_out", None):
- backbone_out = self.backbone_out
- if batch > 1: # expand features if there's more than one prompt
- backbone_out = {
- "backbone_fpn": backbone_out["backbone_fpn"].copy(),
- "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
- }
- for i, feat in enumerate(backbone_out["backbone_fpn"]):
- backbone_out["backbone_fpn"][i] = feat.expand(batch, -1, -1, -1)
- for i, pos in enumerate(backbone_out["vision_pos_enc"]):
- pos = pos.expand(batch, -1, -1, -1)
- backbone_out["vision_pos_enc"][i] = pos
- _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
- return vis_feats, vis_pos_embed, feat_sizes
- return super().get_im_features(im, batch)
-

 class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 """Segment Anything Model 3 (SAM3) Video Semantic Predictor."""
@@ -2479,7 +2478,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 cfg=DEFAULT_CFG,
 overrides=None,
 _callbacks=None,
- bpe_path="bpe_simple_vocab_16e6.txt.gz",
 # prob threshold for detection outputs -- only keep detections above this threshold
 # enters NMS and det-to-track matching
 score_threshold_detection=0.5,
@@ -2499,14 +2497,12 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 hotstart_delay=0,
 hotstart_unmatch_thresh=3,
 hotstart_dup_thresh=3,
- # Whether to suppress masks only within hotstart. If False, we can suppress masks even if they start before hotstart period.
- suppress_unmatched_only_within_hotstart=True,
- init_trk_keep_alive=0,
- max_trk_keep_alive=8,
+ init_trk_keep_alive=30,
+ max_trk_keep_alive=30,
 min_trk_keep_alive=-4,
 # Threshold for suppressing overlapping objects based on recent occlusion
 suppress_overlapping_based_on_recent_occlusion_threshold=0.0,
- decrease_trk_keep_alive_for_empty_masklets=False,
+ decrease_trk_keep_alive_for_empty_masklets=True,
 o2o_matching_masklets_enable=False, # Enable hungarian matching to match existing masklets
 suppress_det_close_to_boundary=False,
 fill_hole_area=16,
@@ -2523,7 +2519,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 reconstruction_bbox_det_score=0.0,
 ):
 """Initialize the SAM3VideoSemanticPredictor with configuration and optional overrides."""
- super().__init__(cfg, overrides, _callbacks, bpe_path=bpe_path)
+ super().__init__(cfg, overrides, _callbacks)
 self.score_threshold_detection = score_threshold_detection
 self.det_nms_thresh = det_nms_thresh
 self.assoc_iou_thresh = assoc_iou_thresh
@@ -2537,7 +2533,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 self.hotstart_delay = hotstart_delay
 self.hotstart_unmatch_thresh = hotstart_unmatch_thresh
 self.hotstart_dup_thresh = hotstart_dup_thresh
- self.suppress_unmatched_only_within_hotstart = suppress_unmatched_only_within_hotstart
 self.init_trk_keep_alive = init_trk_keep_alive
 self.max_trk_keep_alive = max_trk_keep_alive
 self.min_trk_keep_alive = min_trk_keep_alive
@@ -2662,7 +2657,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 ) > 0

 # names = getattr(self.model, "names", [str(i) for i in range(pred_scores.shape[0])])
- names = dict(enumerate(str(i) for i in range(pred_masks.shape[0])))
+ names = dict(enumerate(str(i) for i in range(pred_boxes.shape[0])))
 results = []
 for masks, boxes, orig_img, img_path in zip([pred_masks], [pred_boxes], orig_imgs, self.batch[0]):
 results.append(Results(orig_img, path=img_path, names=names, masks=masks, boxes=boxes))
@@ -2713,7 +2708,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 metadata = tracker_metadata_new["metadata"]
 removed_obj_ids = metadata["removed_obj_ids"]
 out["removed_obj_ids"] = removed_obj_ids
- out["suppressed_obj_ids"] = metadata["suppressed_obj_ids"][frame_idx]
 out["frame_stats"] = frame_stats
 if self.masklet_confirmation_enable:
 status = metadata["masklet_confirmation"]["status"]
@@ -3621,7 +3615,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 overlap_pair_to_frame_inds = metadata["overlap_pair_to_frame_inds"]
 # removed_obj_ids: object IDs that are suppressed via hot-start
 removed_obj_ids = metadata["removed_obj_ids"]
- suppressed_obj_ids = metadata["suppressed_obj_ids"][frame_idx]

 obj_ids_newly_removed = set() # object IDs to be newly removed on this frame
 hotstart_diff = frame_idx - self.hotstart_delay if not reverse else frame_idx + self.hotstart_delay
@@ -3671,12 +3664,12 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 )
 if (
 trk_keep_alive[obj_id] <= 0 # Object has not been matched for too long
- and not self.suppress_unmatched_only_within_hotstart
 and obj_id not in removed_obj_ids
 and obj_id not in obj_ids_newly_removed
 ):
- LOGGER.debug(f"Suppressing object {obj_id} at frame {frame_idx}, due to being unmatched")
- suppressed_obj_ids.add(obj_id)
+ LOGGER.debug(f"Removing object {obj_id} at frame {frame_idx}, due to being unmatched")
+ # directly removed the object instead of suppressing it
+ obj_ids_newly_removed.add(obj_id)

 # Step 3: removed tracks that overlaps with another track for `hotstart_dup_thresh` frames
 # a) find overlaps tracks -- we consider overlap if they match to the same detection
@@ -3855,8 +3848,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 "trk_keep_alive": defaultdict(int), # This is used only for object suppression not for removal
 "overlap_pair_to_frame_inds": defaultdict(list),
 "removed_obj_ids": set(),
- # frame_idx --> set of objects with suppressed outputs, but still continue to be tracked
- "suppressed_obj_ids": defaultdict(set),
 }
 if self.masklet_confirmation_enable:
 # all the following are np.ndarray with the same shape as `obj_ids_all_gpu`
@@ -171,7 +171,7 @@ class TransformerEncoderLayer(nn.Module):
 assert tgt.shape[0] % 2 == 0
 other_tgt = tgt[tgt.shape[0] // 2 :]
 tgt = tgt[: tgt.shape[0] // 2]
- tgt2 = self.norm1(tgt)
+ tgt2 = self.norm1(tgt).contiguous()
 q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
 tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
 tgt = tgt + self.dropout1(tgt2)
@@ -179,13 +179,13 @@ class TransformerEncoderLayer(nn.Module):
 # Recombine
 tgt = torch.cat((tgt, other_tgt), dim=0)
 tgt2 = self.norm2(tgt)
+ memory = memory.to(tgt2.dtype).contiguous()
 tgt2 = self.cross_attn_image(
 query=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
- key=memory.to(tgt2.dtype) + pos if self.pos_enc_at_cross_attn_keys else memory.to(tgt2.dtype),
- value=memory.to(tgt2.dtype),
+ key=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+ value=memory,
 attn_mask=memory_mask,
 key_padding_mask=memory_key_padding_mask,
- # attn_bias=attn_bias,
 )[0]
 tgt = tgt + self.dropout2(tgt2)
 tgt2 = self.norm3(tgt)
@@ -42,8 +42,8 @@ def concat_padded_sequences(seq1, mask1, seq2, mask2, return_index: bool = False
 assert seq1_length == mask1.size(1)
 assert seq2_length == mask2.size(1)

- torch._assert_async(is_right_padded(mask1))
- torch._assert_async(is_right_padded(mask2))
+ torch._assert(is_right_padded(mask1), "Mask is not right padded")
+ torch._assert(is_right_padded(mask2), "Mask is not right padded")

 actual_seq1_lengths = (~mask1).sum(dim=-1)
 actual_seq2_lengths = (~mask2).sum(dim=-1)
@@ -288,7 +288,7 @@ class SequenceGeometryEncoder(nn.Module):
 # Convert boxes to xyxy format and denormalize
 boxes_xyxy = xywh2xyxy(boxes.to(img_feats.dtype))
 scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype)
- scale = scale.pin_memory().to(device=boxes_xyxy.device, non_blocking=True)
+ scale = scale.to(device=boxes_xyxy.device, non_blocking=True)
 scale = scale.view(1, 1, 4)
 boxes_xyxy = boxes_xyxy * scale
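Dropping `pin_memory()` only changes how the scale tensor is staged for the device copy; the box math is unchanged. A quick sketch of the denormalization step with illustrative values and an assumed 1008x1008 input resolution:

```python
import torch

from ultralytics.utils.ops import xywh2xyxy

H = W = 1008                                    # assumed input resolution
boxes = torch.tensor([[[0.5, 0.5, 0.2, 0.4]]])  # normalized xywh, shape (B, N, 4)
boxes_xyxy = xywh2xyxy(boxes)                   # still normalized: [[[0.4, 0.3, 0.6, 0.7]]]
scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype).view(1, 1, 4)
print(boxes_xyxy * scale)                       # pixel-space xyxy: [[[403.2, 302.4, 604.8, 705.6]]]
```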
@@ -103,27 +103,27 @@ class Sam3DualViTDetNeck(nn.Module):

 def forward(
 self, tensor_list: list[torch.Tensor]
- ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
- """Get the feature maps and positional encodings from the neck."""
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor] | None, list[torch.Tensor] | None]:
+ """Get feature maps and positional encodings from the neck."""
 xs = self.trunk(tensor_list)
- sam3_out, sam3_pos = [], []
- sam2_out, sam2_pos = None, None
- if self.sam2_convs is not None:
- sam2_out, sam2_pos = [], []
 x = xs[-1] # simpleFPN
- for i in range(len(self.convs)):
- sam3_x_out = self.convs[i](x)
- sam3_pos_out = self.position_encoding(sam3_x_out).to(sam3_x_out.dtype)
- sam3_out.append(sam3_x_out)
- sam3_pos.append(sam3_pos_out)
-
- if self.sam2_convs is not None:
- sam2_x_out = self.sam2_convs[i](x)
- sam2_pos_out = self.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
- sam2_out.append(sam2_x_out)
- sam2_pos.append(sam2_pos_out)
+ sam3_out, sam3_pos = self.sam_forward_feature_levels(x, self.convs)
+ if self.sam2_convs is None:
+ return sam3_out, sam3_pos, None, None
+ sam2_out, sam2_pos = self.sam_forward_feature_levels(x, self.sam2_convs)
 return sam3_out, sam3_pos, sam2_out, sam2_pos

+ def sam_forward_feature_levels(
+ self, x: torch.Tensor, convs: nn.ModuleList
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+ """Run neck convolutions and compute positional encodings for each feature level."""
+ outs, poss = [], []
+ for conv in convs:
+ feat = conv(x)
+ outs.append(feat)
+ poss.append(self.position_encoding(feat).to(feat.dtype))
+ return outs, poss
+
 def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
 """Set the image size for the trunk backbone."""
 self.trunk.set_imgsz(imgsz)
@@ -11,6 +11,7 @@ import torch
 from ultralytics.nn.modules.utils import inverse_sigmoid
 from ultralytics.utils.ops import xywh2xyxy

+ from ..modules.sam import SAM2Model
 from .geometry_encoders import Prompt
 from .vl_combiner import SAM3VLBackbone

@@ -93,25 +94,6 @@ class SAM3SemanticModel(torch.nn.Module):
 self.text_embeddings = {}
 self.names = []

- def _prepare_backbone_features(self, backbone_out, num_prompts=1):
- """Prepare and flatten visual features from the image backbone output for further processing."""
- if num_prompts > 1: # expand features if there's more than one prompt
- for i, feat in enumerate(backbone_out["backbone_fpn"]):
- backbone_out["backbone_fpn"][i] = feat.expand(num_prompts, -1, -1, -1)
- for i, pos in enumerate(backbone_out["vision_pos_enc"]):
- pos = pos.expand(num_prompts, -1, -1, -1)
- backbone_out["vision_pos_enc"][i] = pos
- assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
- assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
-
- feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
- vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
- feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
- # flatten NxCxHxW to HWxNxC
- vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
- vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
- return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
-
 def _encode_prompt(
 self,
 img_feats,
@@ -304,8 +286,8 @@ class SAM3SemanticModel(torch.nn.Module):
 self, backbone_out: dict[str, torch.Tensor], text_ids: torch.Tensor, geometric_prompt: Prompt = None
 ):
 """Forward pass for grounding (detection + segmentation) given input images and text."""
- backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = self._prepare_backbone_features(
- backbone_out, num_prompts=len(text_ids)
+ backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = SAM2Model._prepare_backbone_features(
+ self, backbone_out, batch=len(text_ids)
 )
 backbone_out.update({k: v for k, v in self.text_embeddings.items()})
 with torch.profiler.record_function("SAM3Image._encode_prompt"):
@@ -110,15 +110,10 @@ class SAM3VLBackbone(nn.Module):
 def forward_image_sam2(self, samples: torch.Tensor):
 """Forward pass of the vision backbone to get SAM2 features only."""
 xs = self.vision_backbone.trunk(samples)
- sam2_features, sam2_pos = [], []
 x = xs[-1] # simpleFPN

 assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
- for i in range(len(self.vision_backbone.sam2_convs)):
- sam2_x_out = self.vision_backbone.sam2_convs[i](x)
- sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
- sam2_features.append(sam2_x_out)
- sam2_pos.append(sam2_pos_out)
+ sam2_features, sam2_pos = self.vision_backbone.sam_forward_feature_levels(x, self.vision_backbone.sam2_convs)

 if self.scalp > 0:
 # Discard the lowest resolution features
@@ -57,7 +57,7 @@ class ClassificationValidator(BaseValidator):
 """Initialize ClassificationValidator with dataloader, save directory, and other parameters.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
 save_dir (str | Path, optional): Directory to save results.
 args (dict, optional): Arguments containing model and validation configuration.
 _callbacks (list, optional): List of callback functions to be called during validation.
@@ -53,7 +53,7 @@ class DetectionTrainer(BaseTrainer):
 """

 def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
- """Initialize a DetectionTrainer object for training YOLO object detection model training.
+ """Initialize a DetectionTrainer object for training YOLO object detection models.

 Args:
 cfg (dict, optional): Default configuration dictionary containing training parameters.
@@ -46,7 +46,7 @@ class DetectionValidator(BaseValidator):
 """Initialize detection validator with necessary variables and settings.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
 save_dir (Path, optional): Directory to save results.
 args (dict[str, Any], optional): Arguments for the validator.
 _callbacks (list[Any], optional): List of callback functions.
@@ -256,7 +256,7 @@ class DetectionValidator(BaseValidator):
 pf = "%22s" + "%11i" * 2 + "%11.3g" * len(self.metrics.keys) # print format
 LOGGER.info(pf % ("all", self.seen, self.metrics.nt_per_class.sum(), *self.metrics.mean_results()))
 if self.metrics.nt_per_class.sum() == 0:
- LOGGER.warning(f"no labels found in {self.args.task} set, can not compute metrics without labels")
+ LOGGER.warning(f"no labels found in {self.args.task} set, cannot compute metrics without labels")

 # Print results per class
 if self.args.verbose and not self.training and self.nc > 1 and len(self.metrics.stats):
@@ -308,7 +308,7 @@ class DetectionValidator(BaseValidator):
 batch_size (int): Size of each batch.

 Returns:
- (torch.utils.data.DataLoader): Dataloader for validation.
+ (torch.utils.data.DataLoader): DataLoader for validation.
 """
 dataset = self.build_dataset(dataset_path, batch=batch_size, mode="val")
 return build_dataloader(
@@ -460,11 +460,11 @@ class DetectionValidator(BaseValidator):

 Args:
 stats (dict[str, Any]): Dictionary to store computed metrics and statistics.
- pred_json (str | Path]): Path to JSON file containing predictions in COCO format.
- anno_json (str | Path]): Path to JSON file containing ground truth annotations in COCO format.
- iou_types (str | list[str]]): IoU type(s) for evaluation. Can be single string or list of strings. Common
+ pred_json (str | Path): Path to JSON file containing predictions in COCO format.
+ anno_json (str | Path): Path to JSON file containing ground truth annotations in COCO format.
+ iou_types (str | list[str]): IoU type(s) for evaluation. Can be single string or list of strings. Common
 values include "bbox", "segm", "keypoints". Defaults to "bbox".
- suffix (str | list[str]]): Suffix to append to metric names in stats dictionary. Should correspond to
+ suffix (str | list[str]): Suffix to append to metric names in stats dictionary. Should correspond to
 iou_types if multiple types provided. Defaults to "Box".

 Returns:
@@ -50,7 +50,7 @@ class OBBValidator(DetectionValidator):
 extends the DetectionValidator class and configures it specifically for the OBB task.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to be used for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
 save_dir (str | Path, optional): Directory to save results.
 args (dict | SimpleNamespace, optional): Arguments containing validation parameters.
 _callbacks (list, optional): List of callback functions to be called during validation.
@@ -59,7 +59,7 @@ class PoseValidator(DetectionValidator):
 specialized metrics for pose evaluation.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to be used for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
 save_dir (Path | str, optional): Directory to save results.
 args (dict, optional): Arguments for the validator including task set to "pose".
 _callbacks (list, optional): List of callback functions to be executed during validation.
@@ -39,7 +39,7 @@ class SegmentationValidator(DetectionValidator):
 """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
 save_dir (Path, optional): Directory to save results.
 args (namespace, optional): Arguments for the validator.
 _callbacks (list, optional): List of callback functions.
@@ -127,7 +127,7 @@ class AutoBackend(nn.Module):

 Methods:
 forward: Run inference on an input image.
- from_numpy: Convert numpy array to tensor.
+ from_numpy: Convert NumPy arrays to tensors on the model device.
 warmup: Warm up the model with a dummy input.
 _model_type: Determine the model type from file path.

@@ -182,7 +182,7 @@ class AutoBackend(nn.Module):
 triton,
 ) = self._model_type("" if nn_module else model)
 fp16 &= pt or jit or onnx or xml or engine or nn_module or triton # FP16
- nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn # BHWC formats (vs torch BCWH)
+ nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn # BHWC formats (vs torch BCHW)
 stride, ch = 32, 3 # default stride and channels
 end2end, dynamic = False, False
 metadata, task = None, None
@@ -894,14 +894,14 @@ class AutoBackend(nn.Module):
 else:
 return self.from_numpy(y)

- def from_numpy(self, x: np.ndarray) -> torch.Tensor:
- """Convert a numpy array to a tensor.
+ def from_numpy(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+ """Convert a NumPy array to a torch tensor on the model device.

 Args:
- x (np.ndarray): The array to be converted.
+ x (np.ndarray | torch.Tensor): Input array or tensor.

 Returns:
- (torch.Tensor): The converted tensor
+ (torch.Tensor): Tensor on `self.device`.
 """
 return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x

@@ -909,7 +909,7 @@
 """Warm up the model by running one forward pass with a dummy input.

 Args:
- imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width)
+ imgsz (tuple[int, int, int, int]): Dummy input shape in (batch, channels, height, width) format.
 """
 warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
 if any(warmup_types) and (self.device.type != "cpu" or self.triton):
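A minimal sketch of how `warmup()` is typically driven, assuming a local `yolo11n.pt` checkpoint (the file name and constructor arguments are illustrative, not part of this diff):

```python
import torch

from ultralytics.nn.autobackend import AutoBackend

backend = AutoBackend("yolo11n.pt")       # assumed local checkpoint
backend.warmup(imgsz=(1, 3, 640, 640))    # dummy BCHW pass; skipped on CPU per the guard above
y = backend(torch.zeros(1, 3, 640, 640))  # NumPy outputs come back as tensors via from_numpy
```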
@@ -931,8 +931,8 @@
 (list[bool]): List of booleans indicating the model type.

 Examples:
- >>> model = AutoBackend(model="path/to/model.onnx")
- >>> model_type = model._model_type() # returns "onnx"
+ >>> types = AutoBackend._model_type("path/to/model.onnx")
+ >>> assert types[2] # onnx
 """
 from ultralytics.engine.exporter import export_formats

@@ -1812,7 +1812,7 @@ class A2C2f(nn.Module):
 """
 super().__init__()
 c_ = int(c2 * e) # hidden channels
- assert c_ % 32 == 0, "Dimension of ABlock be a multiple of 32."
+ assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."

 self.cv1 = Conv(c1, c_, 1, 1)
 self.cv2 = Conv((1 + n) * c_, c2, 1)
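The corrected assertion message reflects a real constraint: the hidden width `c_ = int(c2 * e)` must be divisible by 32. A quick arithmetic check with illustrative values:

```python
c2, e = 512, 0.5
c_ = int(c2 * e)  # 256 -> 256 % 32 == 0, assertion passes
c2, e = 512, 0.3
c_ = int(c2 * e)  # 153 -> 153 % 32 != 0, trips "Dimension of ABlock must be a multiple of 32."
```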
ultralytics/nn/tasks.py CHANGED
@@ -866,7 +866,7 @@ class WorldModel(DetectionModel):
 self.model[-1].nc = len(text)

 def get_text_pe(self, text, batch=80, cache_clip_model=True):
- """Set classes in advance so that model could do offline-inference without clip model.
+ """Get text positional embeddings for offline inference without CLIP model.

 Args:
 text (list[str]): List of class names.
@@ -987,13 +987,13 @@ class YOLOEModel(DetectionModel):

 @smart_inference_mode()
 def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
- """Set classes in advance so that model could do offline-inference without clip model.
+ """Get text positional embeddings for offline inference without CLIP model.

 Args:
 text (list[str]): List of class names.
 batch (int): Batch size for processing text tokens.
 cache_clip_model (bool): Whether to cache the CLIP model.
- without_reprta (bool): Whether to return text embeddings cooperated with reprta module.
+ without_reprta (bool): Whether to return text embeddings without reprta module processing.

 Returns:
 (torch.Tensor): Text positional embeddings.
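The reworded docstrings above describe the offline text-embedding path. A short usage sketch, assuming the documented YOLOE interface and a local `yoloe-11s-seg.pt` checkpoint (names are illustrative):

```python
from ultralytics import YOLOE

model = YOLOE("yoloe-11s-seg.pt")                   # assumed checkpoint
names = ["person", "bus"]
model.set_classes(names, model.get_text_pe(names))  # cache text embeddings for offline inference
results = model.predict("bus.jpg")                  # hypothetical image
```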