dgenerate-ultralytics-headless 8.3.237-py3-none-any.whl → 8.3.239-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/RECORD +104 -105
- tests/test_exports.py +3 -1
- tests/test_python.py +2 -2
- tests/test_solutions.py +6 -6
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +4 -4
- ultralytics/cfg/datasets/Argoverse.yaml +7 -6
- ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
- ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
- ultralytics/cfg/datasets/VOC.yaml +15 -16
- ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
- ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
- ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/kitti.yaml +1 -1
- ultralytics/cfg/datasets/xView.yaml +16 -16
- ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
- ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
- ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
- ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
- ultralytics/data/augment.py +1 -1
- ultralytics/data/base.py +4 -2
- ultralytics/data/build.py +4 -4
- ultralytics/data/loaders.py +17 -12
- ultralytics/data/utils.py +4 -4
- ultralytics/engine/exporter.py +24 -16
- ultralytics/engine/predictor.py +5 -4
- ultralytics/engine/results.py +12 -13
- ultralytics/engine/trainer.py +2 -2
- ultralytics/engine/tuner.py +2 -3
- ultralytics/engine/validator.py +2 -2
- ultralytics/models/fastsam/model.py +2 -2
- ultralytics/models/fastsam/predict.py +2 -3
- ultralytics/models/fastsam/val.py +4 -4
- ultralytics/models/rtdetr/predict.py +2 -3
- ultralytics/models/rtdetr/val.py +5 -4
- ultralytics/models/sam/build.py +5 -5
- ultralytics/models/sam/build_sam3.py +9 -6
- ultralytics/models/sam/model.py +1 -1
- ultralytics/models/sam/modules/sam.py +10 -5
- ultralytics/models/sam/predict.py +24 -48
- ultralytics/models/sam/sam3/encoder.py +4 -4
- ultralytics/models/sam/sam3/geometry_encoders.py +3 -3
- ultralytics/models/sam/sam3/necks.py +17 -17
- ultralytics/models/sam/sam3/sam3_image.py +3 -21
- ultralytics/models/sam/sam3/vl_combiner.py +1 -6
- ultralytics/models/yolo/classify/val.py +1 -1
- ultralytics/models/yolo/detect/train.py +1 -1
- ultralytics/models/yolo/detect/val.py +7 -7
- ultralytics/models/yolo/obb/val.py +1 -1
- ultralytics/models/yolo/pose/val.py +1 -1
- ultralytics/models/yolo/segment/val.py +1 -1
- ultralytics/nn/autobackend.py +9 -9
- ultralytics/nn/modules/block.py +1 -1
- ultralytics/nn/tasks.py +3 -3
- ultralytics/nn/text_model.py +2 -7
- ultralytics/solutions/ai_gym.py +1 -1
- ultralytics/solutions/analytics.py +6 -6
- ultralytics/solutions/config.py +1 -1
- ultralytics/solutions/distance_calculation.py +1 -1
- ultralytics/solutions/object_counter.py +1 -1
- ultralytics/solutions/object_cropper.py +3 -6
- ultralytics/solutions/parking_management.py +21 -17
- ultralytics/solutions/queue_management.py +5 -5
- ultralytics/solutions/region_counter.py +2 -2
- ultralytics/solutions/security_alarm.py +1 -1
- ultralytics/solutions/solutions.py +45 -22
- ultralytics/solutions/speed_estimation.py +1 -1
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/bot_sort.py +4 -3
- ultralytics/trackers/byte_tracker.py +4 -4
- ultralytics/trackers/utils/gmc.py +6 -7
- ultralytics/trackers/utils/kalman_filter.py +2 -1
- ultralytics/trackers/utils/matching.py +4 -3
- ultralytics/utils/__init__.py +12 -3
- ultralytics/utils/benchmarks.py +2 -2
- ultralytics/utils/callbacks/tensorboard.py +19 -25
- ultralytics/utils/checks.py +2 -1
- ultralytics/utils/downloads.py +1 -1
- ultralytics/utils/export/tensorflow.py +16 -2
- ultralytics/utils/files.py +13 -12
- ultralytics/utils/logger.py +62 -27
- ultralytics/utils/metrics.py +1 -1
- ultralytics/utils/ops.py +6 -6
- ultralytics/utils/patches.py +3 -3
- ultralytics/utils/plotting.py +7 -12
- ultralytics/utils/tuner.py +1 -1
- ultralytics/models/sam/sam3/tokenizer_ve.py +0 -242
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/top_level.txt +0 -0
@@ -110,10 +110,12 @@ class Predictor(BasePredictor):
 """Preprocess the input image for model inference.

 This method prepares the input image by applying transformations and normalization. It supports both
-torch.Tensor and list of np.ndarray as input formats.
+torch.Tensor and list of np.ndarray as input formats. For OpenCV-loaded images, the input is typically BGR and
+is converted to RGB during preprocessing.

 Args:
-    im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or list of HWC
+    im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or a list of HWC NumPy arrays.
+        NumPy arrays are expected to be in BGR order (as returned by OpenCV) and will be converted to RGB.

 Returns:
     (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.
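The BGR-to-RGB note above follows the usual OpenCV convention. As a minimal illustrative sketch (not taken from the package; the file path is a placeholder), converting an OpenCV-loaded frame into a normalized BCHW tensor looks roughly like this:

    import cv2
    import torch

    bgr = cv2.imread("frame.jpg")                   # HWC, BGR, uint8 (placeholder path)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)      # HWC, RGB
    im = torch.from_numpy(rgb).permute(2, 0, 1)     # CHW
    im = im.unsqueeze(0).float() / 255.0            # BCHW, normalized to [0, 1]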
@@ -534,7 +536,7 @@ class Predictor(BasePredictor):

 Args:
     image (str | np.ndarray): Path to the image file as a string, or a numpy array representing an image read by
-        cv2.
+        cv2 (BGR channel order).

 Raises:
     AssertionError: If more than one image is attempted to be set.

@@ -1244,14 +1246,11 @@ class SAM2VideoPredictor(SAM2Predictor):
 - If `batch` is greater than 1, the features are expanded to fit the batch size.
 - The method leverages the model's `_prepare_backbone_features` method to prepare the backbone features.
 """
-
-
-
-
-
-pos = pos.expand(batch, -1, -1, -1)
-backbone_out["vision_pos_enc"][i] = pos
-_, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
+# check if there's precomputed backbone output
+backbone_out = getattr(self, "backbone_out", None)
+if backbone_out is None:
+    backbone_out = self.model.forward_image(im)
+_, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out, batch=batch)
 return vis_feats, vis_pos_embed, feat_sizes

 def _obj_id_to_idx(self, obj_id, inference_state: dict[str, Any] | None = None):
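The change above reuses a precomputed backbone output when one has been attached to the predictor, and only runs the image encoder as a fallback. A minimal sketch of that caching pattern (class and attribute names here are illustrative, not the package API):

    import torch

    class FeatureCache:
        """Illustrative only: reuse a cached encoder output when present, else compute it."""

        def __init__(self, encoder):
            self.encoder = encoder
            self.backbone_out = None  # may be populated by a caller ahead of time

        def get_features(self, im: torch.Tensor) -> torch.Tensor:
            out = getattr(self, "backbone_out", None)  # check for a precomputed result
            if out is None:
                out = self.encoder(im)  # fall back to running the encoder
            return out

    cache = FeatureCache(encoder=lambda x: x * 2)
    print(cache.get_features(torch.ones(1)))  # tensor([2.])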
@@ -2055,11 +2054,12 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 self.memory_bank.append(consolidated_out)

 def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
-    """Prepare
-
-
-
-
+    """Prepare memory-conditioned features for the current image state.
+
+    If ``obj_idx`` is provided, features are prepared for a specific prompted object in the image. If ``obj_idx`` is
+    None, features are prepared for all objects. If no memory is available, a no-memory embedding is added to the
+    current vision features. Otherwise, memory from previous frames is used to condition the current vision features
+    via a transformer attention mechanism.

     Args:
         obj_idx (int | None): The index of the object for which to prepare the features.

@@ -2068,8 +2068,8 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     pix_feat_with_mem (torch.Tensor): The memory-conditioned pixel features.
 """
 if len(self.memory_bank) == 0 or isinstance(obj_idx, int):
-    #
-    #
+    # For initial conditioning frames, encode without using any previous memory.
+    # Directly add the no-memory embedding (instead of using the transformer encoder).
     pix_feat_with_mem = self.vision_feats[-1] + self.model.no_mem_embed
 else:
     # for inference frames, use the memory features from previous frames

@@ -2081,7 +2081,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     memory_pos=memory_pos_embed,
     num_obj_ptr_tokens=0,  # num_obj_ptr_tokens
 )
-#
+# Reshape output (HW)BC => BCHW
 return pix_feat_with_mem.permute(1, 2, 0).view(
     self._max_obj_num,
     self.model.memory_attention.d_model,

@@ -2145,9 +2145,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     pix_feat = pix_feat.view(-1, self.model.memory_attention.d_model, *self.feat_sizes[-1])
     _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._use_mask_as_output(mask)
 else:
-    #
+    # Fuse visual features with previous memory features in the memory bank.
     pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
-    #
+    # If ``obj_idx`` is provided (i.e., prompts are being added), keep only the first feature map.
     pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
     _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
         backbone_features=pix_feat_with_mem,

@@ -2182,7 +2182,7 @@ class SAM3Predictor(SAM2Predictor):
 self.std = torch.tensor([127.5, 127.5, 127.5]).view(-1, 1, 1).to(self.device)

 def get_model(self):
-    """Retrieve and initialize the Segment Anything Model
+    """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
     from .build_sam3 import build_interactive_sam3  # slow import

     return build_interactive_sam3(self.args.model, compile=self.args.compile)

@@ -2191,16 +2191,11 @@ class SAM3Predictor(SAM2Predictor):
 class SAM3SemanticPredictor(SAM3Predictor):
     """Segment Anything Model 3 (SAM3) Predictor for image segmentation tasks."""

-    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None, bpe_path=None):
-        """Initialize the SAM3SemanticPredictor with configuration and optional overrides."""
-        super().__init__(cfg, overrides, _callbacks)
-        self.bpe_path = bpe_path
-
     def get_model(self):
         """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
         from .build_sam3 import build_sam3_image_model  # slow import

-        return build_sam3_image_model(self.args.model,
+        return build_sam3_image_model(self.args.model, compile=self.args.compile)

     @smart_inference_mode()
     def get_im_features(self, im):

@@ -2437,24 +2432,6 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):

     return obj_ids, pred_masks, obj_scores

-def get_im_features(self, im, batch=1):
-    """A wrapper to get image features, supporting pre-extracted backbone outputs."""
-    if getattr(self, "backbone_out", None):
-        backbone_out = self.backbone_out
-        if batch > 1:  # expand features if there's more than one prompt
-            backbone_out = {
-                "backbone_fpn": backbone_out["backbone_fpn"].copy(),
-                "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
-            }
-            for i, feat in enumerate(backbone_out["backbone_fpn"]):
-                backbone_out["backbone_fpn"][i] = feat.expand(batch, -1, -1, -1)
-            for i, pos in enumerate(backbone_out["vision_pos_enc"]):
-                pos = pos.expand(batch, -1, -1, -1)
-                backbone_out["vision_pos_enc"][i] = pos
-        _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
-        return vis_feats, vis_pos_embed, feat_sizes
-    return super().get_im_features(im, batch)
-

 class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
     """Segment Anything Model 3 (SAM3) Video Semantic Predictor."""

@@ -2479,7 +2456,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
     cfg=DEFAULT_CFG,
     overrides=None,
     _callbacks=None,
-    bpe_path="bpe_simple_vocab_16e6.txt.gz",
     # prob threshold for detection outputs -- only keep detections above this threshold
     # enters NMS and det-to-track matching
     score_threshold_detection=0.5,

@@ -2523,7 +2499,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
     reconstruction_bbox_det_score=0.0,
 ):
     """Initialize the SAM3VideoSemanticPredictor with configuration and optional overrides."""
-    super().__init__(cfg, overrides, _callbacks
+    super().__init__(cfg, overrides, _callbacks)
     self.score_threshold_detection = score_threshold_detection
     self.det_nms_thresh = det_nms_thresh
     self.assoc_iou_thresh = assoc_iou_thresh

@@ -171,7 +171,7 @@ class TransformerEncoderLayer(nn.Module):
 assert tgt.shape[0] % 2 == 0
 other_tgt = tgt[tgt.shape[0] // 2 :]
 tgt = tgt[: tgt.shape[0] // 2]
-tgt2 = self.norm1(tgt)
+tgt2 = self.norm1(tgt).contiguous()
 q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
 tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
 tgt = tgt + self.dropout1(tgt2)

@@ -179,13 +179,13 @@ class TransformerEncoderLayer(nn.Module):
 # Recombine
 tgt = torch.cat((tgt, other_tgt), dim=0)
 tgt2 = self.norm2(tgt)
+memory = memory.to(tgt2.dtype).contiguous()
 tgt2 = self.cross_attn_image(
     query=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
-    key=memory
-    value=memory
+    key=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+    value=memory,
     attn_mask=memory_mask,
     key_padding_mask=memory_key_padding_mask,
-    # attn_bias=attn_bias,
 )[0]
 tgt = tgt + self.dropout2(tgt2)
 tgt2 = self.norm3(tgt)
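The `.to(tgt2.dtype).contiguous()` call above aligns the cross-attention key/value tensor with the query's dtype and memory layout before attention. A minimal sketch of that pattern (shapes and the attention module here are illustrative, not the package's classes):

    import torch
    import torch.nn as nn

    attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
    q = torch.randn(10, 2, 256)                            # (seq, batch, dim), float32
    memory = torch.randn(20, 2, 256, dtype=torch.float16)  # e.g. produced under autocast
    memory = memory.to(q.dtype).contiguous()               # match query dtype, ensure contiguity
    out, _ = attn(q, memory, memory)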
@@ -42,8 +42,8 @@ def concat_padded_sequences(seq1, mask1, seq2, mask2, return_index: bool = False
 assert seq1_length == mask1.size(1)
 assert seq2_length == mask2.size(1)

-torch.
-torch.
+torch._assert(is_right_padded(mask1), "Mask is not right padded")
+torch._assert(is_right_padded(mask2), "Mask is not right padded")

 actual_seq1_lengths = (~mask1).sum(dim=-1)
 actual_seq2_lengths = (~mask2).sum(dim=-1)
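Unlike a bare Python `assert`, `torch._assert` stays traceable under FX/torch.compile. A small hedged sketch of the check above; the `is_right_padded` helper here is a stand-in written for illustration (the package's own helper may differ), and the mask convention assumes True marks padding:

    import torch

    def is_right_padded(m: torch.Tensor) -> bool:
        # once padding (True) starts along the sequence dimension, it must never stop
        return bool((m.int().diff(dim=1) >= 0).all())

    mask = torch.tensor([[False, False, True, True]])  # right-padded example
    torch._assert(is_right_padded(mask), "Mask is not right padded")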
@@ -288,7 +288,7 @@ class SequenceGeometryEncoder(nn.Module):
 # Convert boxes to xyxy format and denormalize
 boxes_xyxy = xywh2xyxy(boxes.to(img_feats.dtype))
 scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype)
-scale = scale.
+scale = scale.to(device=boxes_xyxy.device, non_blocking=True)
 scale = scale.view(1, 1, 4)
 boxes_xyxy = boxes_xyxy * scale

@@ -103,27 +103,27 @@ class Sam3DualViTDetNeck(nn.Module):

 def forward(
     self, tensor_list: list[torch.Tensor]
-) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
-    """Get
+) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor] | None, list[torch.Tensor] | None]:
+    """Get feature maps and positional encodings from the neck."""
     xs = self.trunk(tensor_list)
-    sam3_out, sam3_pos = [], []
-    sam2_out, sam2_pos = None, None
-    if self.sam2_convs is not None:
-        sam2_out, sam2_pos = [], []
     x = xs[-1]  # simpleFPN
-
-
-
-
-        sam3_pos.append(sam3_pos_out)
-
-    if self.sam2_convs is not None:
-        sam2_x_out = self.sam2_convs[i](x)
-        sam2_pos_out = self.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
-        sam2_out.append(sam2_x_out)
-        sam2_pos.append(sam2_pos_out)
+    sam3_out, sam3_pos = self.sam_forward_feature_levels(x, self.convs)
+    if self.sam2_convs is None:
+        return sam3_out, sam3_pos, None, None
+    sam2_out, sam2_pos = self.sam_forward_feature_levels(x, self.sam2_convs)
     return sam3_out, sam3_pos, sam2_out, sam2_pos

+def sam_forward_feature_levels(
+    self, x: torch.Tensor, convs: nn.ModuleList
+) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+    """Run neck convolutions and compute positional encodings for each feature level."""
+    outs, poss = [], []
+    for conv in convs:
+        feat = conv(x)
+        outs.append(feat)
+        poss.append(self.position_encoding(feat).to(feat.dtype))
+    return outs, poss
+
 def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
     """Set the image size for the trunk backbone."""
     self.trunk.set_imgsz(imgsz)
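The refactor above replaces two near-identical conv-plus-positional-encoding loops with one shared helper. A self-contained sketch of that pattern, with toy modules standing in for the real trunk, convs, and positional encoding (none of these names come from the package):

    import torch
    import torch.nn as nn

    class TinyNeck(nn.Module):
        def __init__(self, channels: int = 32, levels: int = 3):
            super().__init__()
            self.convs = nn.ModuleList([nn.Conv2d(channels, channels, 1) for _ in range(levels)])

        def position_encoding(self, feat: torch.Tensor) -> torch.Tensor:
            return torch.zeros_like(feat)  # stand-in for a real positional encoding

        def forward_feature_levels(self, x: torch.Tensor, convs: nn.ModuleList):
            outs, poss = [], []
            for conv in convs:
                feat = conv(x)
                outs.append(feat)
                poss.append(self.position_encoding(feat).to(feat.dtype))
            return outs, poss

    neck = TinyNeck()
    feats, pos = neck.forward_feature_levels(torch.randn(1, 32, 16, 16), neck.convs)
    print(len(feats), feats[0].shape)  # 3 torch.Size([1, 32, 16, 16])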
@@ -11,6 +11,7 @@ import torch
 from ultralytics.nn.modules.utils import inverse_sigmoid
 from ultralytics.utils.ops import xywh2xyxy

+from ..modules.sam import SAM2Model
 from .geometry_encoders import Prompt
 from .vl_combiner import SAM3VLBackbone

@@ -93,25 +94,6 @@ class SAM3SemanticModel(torch.nn.Module):
     self.text_embeddings = {}
     self.names = []

-def _prepare_backbone_features(self, backbone_out, num_prompts=1):
-    """Prepare and flatten visual features from the image backbone output for further processing."""
-    if num_prompts > 1:  # expand features if there's more than one prompt
-        for i, feat in enumerate(backbone_out["backbone_fpn"]):
-            backbone_out["backbone_fpn"][i] = feat.expand(num_prompts, -1, -1, -1)
-        for i, pos in enumerate(backbone_out["vision_pos_enc"]):
-            pos = pos.expand(num_prompts, -1, -1, -1)
-            backbone_out["vision_pos_enc"][i] = pos
-    assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
-    assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
-
-    feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
-    vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
-    feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
-    # flatten NxCxHxW to HWxNxC
-    vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
-    vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
-    return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
-
 def _encode_prompt(
     self,
     img_feats,

@@ -304,8 +286,8 @@ class SAM3SemanticModel(torch.nn.Module):
     self, backbone_out: dict[str, torch.Tensor], text_ids: torch.Tensor, geometric_prompt: Prompt = None
 ):
     """Forward pass for grounding (detection + segmentation) given input images and text."""
-    backbone_out, img_feats, img_pos_embeds, vis_feat_sizes =
-        backbone_out,
+    backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = SAM2Model._prepare_backbone_features(
+        self, backbone_out, batch=len(text_ids)
     )
     backbone_out.update({k: v for k, v in self.text_embeddings.items()})
     with torch.profiler.record_function("SAM3Image._encode_prompt"):

@@ -110,15 +110,10 @@ class SAM3VLBackbone(nn.Module):
 def forward_image_sam2(self, samples: torch.Tensor):
     """Forward pass of the vision backbone to get SAM2 features only."""
     xs = self.vision_backbone.trunk(samples)
-    sam2_features, sam2_pos = [], []
     x = xs[-1]  # simpleFPN

     assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
-
-    sam2_x_out = self.vision_backbone.sam2_convs[i](x)
-    sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
-    sam2_features.append(sam2_x_out)
-    sam2_pos.append(sam2_pos_out)
+    sam2_features, sam2_pos = self.vision_backbone.sam_forward_feature_levels(x, self.vision_backbone.sam2_convs)

     if self.scalp > 0:
         # Discard the lowest resolution features

@@ -57,7 +57,7 @@ class ClassificationValidator(BaseValidator):
 """Initialize ClassificationValidator with dataloader, save directory, and other parameters.

 Args:
-    dataloader (torch.utils.data.DataLoader, optional):
+    dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
     save_dir (str | Path, optional): Directory to save results.
     args (dict, optional): Arguments containing model and validation configuration.
     _callbacks (list, optional): List of callback functions to be called during validation.

@@ -53,7 +53,7 @@ class DetectionTrainer(BaseTrainer):
 """

 def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
-    """Initialize a DetectionTrainer object for training YOLO object detection
+    """Initialize a DetectionTrainer object for training YOLO object detection models.

     Args:
         cfg (dict, optional): Default configuration dictionary containing training parameters.

@@ -46,7 +46,7 @@ class DetectionValidator(BaseValidator):
 """Initialize detection validator with necessary variables and settings.

 Args:
-    dataloader (torch.utils.data.DataLoader, optional):
+    dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
     save_dir (Path, optional): Directory to save results.
     args (dict[str, Any], optional): Arguments for the validator.
     _callbacks (list[Any], optional): List of callback functions.

@@ -256,7 +256,7 @@ class DetectionValidator(BaseValidator):
 pf = "%22s" + "%11i" * 2 + "%11.3g" * len(self.metrics.keys)  # print format
 LOGGER.info(pf % ("all", self.seen, self.metrics.nt_per_class.sum(), *self.metrics.mean_results()))
 if self.metrics.nt_per_class.sum() == 0:
-    LOGGER.warning(f"no labels found in {self.args.task} set,
+    LOGGER.warning(f"no labels found in {self.args.task} set, cannot compute metrics without labels")

 # Print results per class
 if self.args.verbose and not self.training and self.nc > 1 and len(self.metrics.stats):

@@ -308,7 +308,7 @@ class DetectionValidator(BaseValidator):
     batch_size (int): Size of each batch.

 Returns:
-    (torch.utils.data.DataLoader):
+    (torch.utils.data.DataLoader): DataLoader for validation.
 """
 dataset = self.build_dataset(dataset_path, batch=batch_size, mode="val")
 return build_dataloader(

@@ -460,11 +460,11 @@ class DetectionValidator(BaseValidator):

 Args:
     stats (dict[str, Any]): Dictionary to store computed metrics and statistics.
-    pred_json (str | Path
-    anno_json (str | Path
-    iou_types (str | list[str]
+    pred_json (str | Path): Path to JSON file containing predictions in COCO format.
+    anno_json (str | Path): Path to JSON file containing ground truth annotations in COCO format.
+    iou_types (str | list[str]): IoU type(s) for evaluation. Can be single string or list of strings. Common
         values include "bbox", "segm", "keypoints". Defaults to "bbox".
-    suffix (str | list[str]
+    suffix (str | list[str]): Suffix to append to metric names in stats dictionary. Should correspond to
         iou_types if multiple types provided. Defaults to "Box".

 Returns:

@@ -50,7 +50,7 @@ class OBBValidator(DetectionValidator):
 extends the DetectionValidator class and configures it specifically for the OBB task.

 Args:
-    dataloader (torch.utils.data.DataLoader, optional):
+    dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
     save_dir (str | Path, optional): Directory to save results.
     args (dict | SimpleNamespace, optional): Arguments containing validation parameters.
     _callbacks (list, optional): List of callback functions to be called during validation.

@@ -59,7 +59,7 @@ class PoseValidator(DetectionValidator):
 specialized metrics for pose evaluation.

 Args:
-    dataloader (torch.utils.data.DataLoader, optional):
+    dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
     save_dir (Path | str, optional): Directory to save results.
     args (dict, optional): Arguments for the validator including task set to "pose".
     _callbacks (list, optional): List of callback functions to be executed during validation.

@@ -39,7 +39,7 @@ class SegmentationValidator(DetectionValidator):
 """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics.

 Args:
-    dataloader (torch.utils.data.DataLoader, optional):
+    dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
     save_dir (Path, optional): Directory to save results.
     args (namespace, optional): Arguments for the validator.
     _callbacks (list, optional): List of callback functions.

ultralytics/nn/autobackend.py CHANGED

@@ -127,7 +127,7 @@ class AutoBackend(nn.Module):

 Methods:
     forward: Run inference on an input image.
-    from_numpy: Convert
+    from_numpy: Convert NumPy arrays to tensors on the model device.
     warmup: Warm up the model with a dummy input.
     _model_type: Determine the model type from file path.

@@ -182,7 +182,7 @@ class AutoBackend(nn.Module):
     triton,
 ) = self._model_type("" if nn_module else model)
 fp16 &= pt or jit or onnx or xml or engine or nn_module or triton  # FP16
-nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn  # BHWC formats (vs torch
+nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn  # BHWC formats (vs torch BCHW)
 stride, ch = 32, 3  # default stride and channels
 end2end, dynamic = False, False
 metadata, task = None, None

@@ -894,14 +894,14 @@ class AutoBackend(nn.Module):
 else:
     return self.from_numpy(y)

-def from_numpy(self, x: np.ndarray) -> torch.Tensor:
-    """Convert a
+def from_numpy(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+    """Convert a NumPy array to a torch tensor on the model device.

     Args:
-        x (np.ndarray):
+        x (np.ndarray | torch.Tensor): Input array or tensor.

     Returns:
-        (torch.Tensor):
+        (torch.Tensor): Tensor on `self.device`.
     """
     return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x

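The from_numpy contract documented above is simple: NumPy arrays are copied into a tensor on the model's device, while tensors pass through unchanged. A minimal standalone sketch (the free function and `device` argument are illustrative; in the package this is a method using `self.device`):

    import numpy as np
    import torch

    def from_numpy(x, device=torch.device("cpu")):
        return torch.tensor(x).to(device) if isinstance(x, np.ndarray) else x

    print(from_numpy(np.zeros((1, 3, 640, 640), dtype=np.float32)).shape)  # torch.Size([1, 3, 640, 640])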
@@ -909,7 +909,7 @@ class AutoBackend(nn.Module):
 """Warm up the model by running one forward pass with a dummy input.

 Args:
-    imgsz (tuple
+    imgsz (tuple[int, int, int, int]): Dummy input shape in (batch, channels, height, width) format.
 """
 warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
 if any(warmup_types) and (self.device.type != "cpu" or self.triton):

@@ -931,8 +931,8 @@ class AutoBackend(nn.Module):
     (list[bool]): List of booleans indicating the model type.

 Examples:
-    >>>
-    >>>
+    >>> types = AutoBackend._model_type("path/to/model.onnx")
+    >>> assert types[2]  # onnx
 """
 from ultralytics.engine.exporter import export_formats

ultralytics/nn/modules/block.py CHANGED

@@ -1812,7 +1812,7 @@ class A2C2f(nn.Module):
 """
 super().__init__()
 c_ = int(c2 * e)  # hidden channels
-assert c_ % 32 == 0, "Dimension of ABlock be a multiple of 32."
+assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."

 self.cv1 = Conv(c1, c_, 1, 1)
 self.cv2 = Conv((1 + n) * c_, c2, 1)

ultralytics/nn/tasks.py CHANGED

@@ -866,7 +866,7 @@ class WorldModel(DetectionModel):
 self.model[-1].nc = len(text)

 def get_text_pe(self, text, batch=80, cache_clip_model=True):
-    """
+    """Get text positional embeddings for offline inference without CLIP model.

     Args:
         text (list[str]): List of class names.

@@ -987,13 +987,13 @@ class YOLOEModel(DetectionModel):

 @smart_inference_mode()
 def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
-    """
+    """Get text positional embeddings for offline inference without CLIP model.

     Args:
         text (list[str]): List of class names.
         batch (int): Batch size for processing text tokens.
         cache_clip_model (bool): Whether to cache the CLIP model.
-        without_reprta (bool): Whether to return text embeddings
+        without_reprta (bool): Whether to return text embeddings without reprta module processing.

     Returns:
         (torch.Tensor): Text positional embeddings.

ultralytics/nn/text_model.py CHANGED

@@ -196,12 +196,7 @@ class MobileCLIP(TextModel):
     device (torch.device): Device to load the model on.
 """
 try:
-    import
-
-    # Suppress 'timm.models.layers is deprecated, please import via timm.layers' warning from mobileclip usage
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=FutureWarning)
-        import mobileclip
+    import mobileclip
 except ImportError:
     # Ultralytics fork preferred since Apple MobileCLIP repo has incorrect version of torchvision
     checks.check_requirements("git+https://github.com/ultralytics/mobileclip.git")

@@ -308,7 +303,7 @@ class MobileCLIPTS(TextModel):
     (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).

 Examples:
-    >>> model = MobileCLIPTS("cpu")
+    >>> model = MobileCLIPTS(device=torch.device("cpu"))
     >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
     >>> strict_tokens = model.tokenize(
     ...     ["a very long caption"], truncate=False

ultralytics/solutions/ai_gym.py CHANGED

@@ -13,7 +13,7 @@ class AIGym(BaseSolution):
 repetitions of exercises based on predefined angle thresholds for up and down positions.

 Attributes:
-    states (dict[
+    states (dict[int, dict[str, float | int | str]]): Per-track angle, rep count, and stage for workout monitoring.
     up_angle (float): Angle threshold for considering the 'up' position of an exercise.
     down_angle (float): Angle threshold for considering the 'down' position of an exercise.
     kpts (list[int]): Indices of keypoints used for angle calculation.

@@ -56,7 +56,7 @@ class Analytics(BaseSolution):
 from matplotlib.backends.backend_agg import FigureCanvasAgg
 from matplotlib.figure import Figure

-self.type = self.CFG["analytics_type"]  # type
+self.type = self.CFG["analytics_type"]  # Chart type: "line", "pie", "bar", or "area".
 self.x_label = "Classes" if self.type in {"bar", "pie"} else "Frame#"
 self.y_label = "Total Counts"

@@ -66,10 +66,10 @@ class Analytics(BaseSolution):
 self.title = "Ultralytics Solutions"  # window name
 self.max_points = 45  # maximum points to be drawn on window
 self.fontsize = 25  # text font size for display
-figsize = self.CFG["figsize"]  #
+figsize = self.CFG["figsize"]  # Output size, e.g. (12.8, 7.2) -> 1280x720.
 self.color_cycle = cycle(["#DD00BA", "#042AFF", "#FF4447", "#7D24FF", "#BD00FF"])

-self.total_counts = 0  #
+self.total_counts = 0  # Stores total counts for line charts.
 self.clswise_count = {}  # dictionary for class-wise counts
 self.update_every = kwargs.get("update_every", 30)  # Only update graph every 30 frames by default
 self.last_plot_im = None  # Cache of the last rendered chart

@@ -104,7 +104,7 @@ class Analytics(BaseSolution):
 and 'classwise_count' (dict, per-class object count).

 Raises:
-
+    ValueError: If an unsupported chart type is specified.

 Examples:
     >>> analytics = Analytics(analytics_type="line")

@@ -131,9 +131,9 @@ class Analytics(BaseSolution):
     )
     plot_im = self.last_plot_im
 else:
-    raise
+    raise ValueError(f"Unsupported analytics_type='{self.type}'. Supported types: line, bar, pie, area.")

-#
+# Return results for downstream use.
 return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids), classwise_count=self.clswise_count)

 def update_graph(
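The Analytics change above now fails fast with a descriptive ValueError when an unsupported chart type is configured. A hedged, self-contained sketch of that validation pattern (names are illustrative, not the package API):

    SUPPORTED_TYPES = {"line", "bar", "pie", "area"}

    def check_analytics_type(analytics_type: str) -> str:
        if analytics_type not in SUPPORTED_TYPES:
            raise ValueError(
                f"Unsupported analytics_type='{analytics_type}'. Supported types: {sorted(SUPPORTED_TYPES)}"
            )
        return analytics_type

    check_analytics_type("line")  # passes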
ultralytics/solutions/config.py CHANGED

@@ -35,7 +35,7 @@ class SolutionConfig:
 vision_point (tuple[int, int]): Reference point for directional tracking or perspective drawing.
 crop_dir (str): Directory path to save cropped detection images.
 json_file (str): Path to a JSON file containing data for parking areas.
-line_width (int): Width for visual display
+line_width (int): Width for visual display, e.g. bounding boxes, keypoints, and counts.
 records (int): Number of detection records to send email alerts.
 fps (float): Frame rate (Frames Per Second) for speed estimation calculation.
 max_hist (int): Maximum number of historical points or states stored per tracked object for speed estimation.

@@ -17,7 +17,7 @@ class DistanceCalculation(BaseSolution):

 Attributes:
     left_mouse_count (int): Counter for left mouse button clicks.
-    selected_boxes (dict[int,
+    selected_boxes (dict[int, Any]): Dictionary to store selected bounding boxes keyed by track ID.
     centroids (list[list[int]]): List to store centroids of selected bounding boxes.

 Methods:

@@ -19,7 +19,7 @@ class ObjectCounter(BaseSolution):
 in_count (int): Counter for objects moving inward.
 out_count (int): Counter for objects moving outward.
 counted_ids (list[int]): List of IDs of objects that have been counted.
-
+classwise_count (dict[str, dict[str, int]]): Dictionary for counts, categorized by object class.
 region_initialized (bool): Flag indicating whether the counting region has been initialized.
 show_in (bool): Flag to control display of inward count.
 show_out (bool): Flag to control display of outward count.