dgenerate-ultralytics-headless 8.3.236-py3-none-any.whl → 8.3.237-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/RECORD +38 -25
- ultralytics/__init__.py +1 -1
- ultralytics/engine/exporter.py +17 -10
- ultralytics/engine/predictor.py +3 -2
- ultralytics/engine/trainer.py +8 -0
- ultralytics/models/rtdetr/val.py +5 -1
- ultralytics/models/sam/__init__.py +14 -1
- ultralytics/models/sam/build.py +17 -8
- ultralytics/models/sam/build_sam3.py +374 -0
- ultralytics/models/sam/model.py +12 -4
- ultralytics/models/sam/modules/blocks.py +20 -8
- ultralytics/models/sam/modules/decoders.py +2 -3
- ultralytics/models/sam/modules/encoders.py +4 -1
- ultralytics/models/sam/modules/memory_attention.py +6 -2
- ultralytics/models/sam/modules/sam.py +150 -6
- ultralytics/models/sam/modules/utils.py +134 -4
- ultralytics/models/sam/predict.py +2076 -118
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +535 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +198 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +357 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/tokenizer_ve.py +242 -0
- ultralytics/models/sam/sam3/vitdet.py +546 -0
- ultralytics/models/sam/sam3/vl_combiner.py +165 -0
- ultralytics/models/yolo/obb/val.py +18 -7
- ultralytics/nn/modules/transformer.py +21 -1
- ultralytics/utils/checks.py +2 -2
- ultralytics/utils/ops.py +1 -3
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/sam3/vl_combiner.py ADDED
@@ -0,0 +1,165 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+"""Provides utility to combine a vision backbone with a language backbone."""
+
+from __future__ import annotations
+
+from copy import copy
+
+import torch
+import torch.nn as nn
+from torch.nn.attention import SDPBackend, sdpa_kernel
+
+from .necks import Sam3DualViTDetNeck
+
+
+class SAM3VLBackbone(nn.Module):
+    """This backbone combines a vision backbone and a language backbone without fusion. As such it is more of a
+    convenience wrapper to handle the two backbones together.
+
+    It adds support for activation checkpointing and compilation.
+    """
+
+    def __init__(
+        self,
+        visual: Sam3DualViTDetNeck,
+        text,
+        compile_visual: bool = False,
+        act_ckpt_whole_vision_backbone: bool = False,
+        act_ckpt_whole_language_backbone: bool = False,
+        scalp=0,
+    ):
+        """Initialize the backbone combiner.
+
+        :param visual: The vision backbone to use
+        :param text: The text encoder to use
+        """
+        super().__init__()
+        self.vision_backbone: Sam3DualViTDetNeck = torch.compile(visual) if compile_visual else visual
+        self.language_backbone = text
+        self.scalp = scalp
+        # allow running activation checkpointing on the entire vision and language backbones
+        self.act_ckpt_whole_vision_backbone = act_ckpt_whole_vision_backbone
+        self.act_ckpt_whole_language_backbone = act_ckpt_whole_language_backbone
+
+    def forward(
+        self,
+        samples: torch.Tensor,
+        captions: list[str],
+        input_boxes: torch.Tensor = None,
+        additional_text: list[str] | None = None,
+    ):
+        """Forward pass of the backbone combiner.
+
+        :param samples: The input images
+        :param captions: The input captions
+        :param input_boxes: If the text contains place-holders for boxes, this
+            parameter contains the tensor containing their spatial features
+        :param additional_text: This can be used to encode some additional text
+            (different from the captions) in the same forward of the backbone
+        :return: Output dictionary with the following keys:
+            - vision_features: The output of the vision backbone
+            - language_features: The output of the language backbone
+            - language_mask: The attention mask of the language backbone
+            - vision_pos_enc: The positional encoding of the vision backbone
+            - (optional) additional_text_features: The output of the language
+              backbone for the additional text
+            - (optional) additional_text_mask: The attention mask of the
+              language backbone for the additional text
+        """
+        output = self.forward_image(samples)
+        output.update(self.forward_text(captions, input_boxes, additional_text))
+        return output
+
+    def forward_image(self, samples: torch.Tensor):
+        """Forward pass of the vision backbone and get both SAM3 and SAM2 features."""
+        # Forward through backbone
+        sam3_features, sam3_pos, sam2_features, sam2_pos = self.vision_backbone.forward(samples)
+        if self.scalp > 0:
+            # Discard the lowest resolution features
+            sam3_features, sam3_pos = (
+                sam3_features[: -self.scalp],
+                sam3_pos[: -self.scalp],
+            )
+            if sam2_features is not None and sam2_pos is not None:
+                sam2_features, sam2_pos = (
+                    sam2_features[: -self.scalp],
+                    sam2_pos[: -self.scalp],
+                )
+
+        sam2_output = None
+
+        if sam2_features is not None and sam2_pos is not None:
+            sam2_src = sam2_features[-1]
+            sam2_output = {
+                "vision_features": sam2_src,
+                "vision_pos_enc": sam2_pos,
+                "backbone_fpn": sam2_features,
+            }
+
+        sam3_src = sam3_features[-1]
+        return {
+            "vision_features": sam3_src,
+            "vision_pos_enc": sam3_pos,
+            "backbone_fpn": sam3_features,
+            "sam2_backbone_out": sam2_output,
+        }
+
+    def forward_image_sam2(self, samples: torch.Tensor):
+        """Forward pass of the vision backbone to get SAM2 features only."""
+        xs = self.vision_backbone.trunk(samples)
+        sam2_features, sam2_pos = [], []
+        x = xs[-1]  # simpleFPN
+
+        assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
+        for i in range(len(self.vision_backbone.sam2_convs)):
+            sam2_x_out = self.vision_backbone.sam2_convs[i](x)
+            sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
+            sam2_features.append(sam2_x_out)
+            sam2_pos.append(sam2_pos_out)
+
+        if self.scalp > 0:
+            # Discard the lowest resolution features
+            sam2_features, sam2_pos = (
+                sam2_features[: -self.scalp],
+                sam2_pos[: -self.scalp],
+            )
+
+        return {
+            "vision_features": sam2_features[-1],
+            "vision_pos_enc": sam2_pos,
+            "backbone_fpn": sam2_features,
+        }
+
+    def forward_text(self, captions, input_boxes=None, additional_text=None):
+        """Forward pass of the text encoder."""
+        output = {}
+
+        # Forward through text_encoder
+        text_to_encode = copy(captions)
+        if additional_text is not None:
+            # if there are additional_text, we piggy-back them into this forward.
+            # They'll be used later for output alignment
+            text_to_encode += additional_text
+
+        with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]):
+            text_attention_mask, text_memory, text_embeds = self.language_backbone(text_to_encode, input_boxes)
+
+        if additional_text is not None:
+            output["additional_text_features"] = text_memory[:, -len(additional_text) :]
+            output["additional_text_mask"] = text_attention_mask[-len(additional_text) :]
+
+        text_memory = text_memory[:, : len(captions)]
+        text_attention_mask = text_attention_mask[: len(captions)]
+        text_embeds = text_embeds[:, : len(captions)]
+        output["language_features"] = text_memory
+        output["language_mask"] = text_attention_mask
+        output["language_embeds"] = text_embeds  # Text embeddings before forward to the encoder
+
+        return output
+
+    def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
+        """Set the image size for the vision backbone."""
+        self.vision_backbone.set_imgsz(imgsz)
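For orientation, here is a minimal usage sketch of the new SAM3VLBackbone. It is not taken from the package: `vision_neck` and `text_encoder` are hypothetical placeholders for the Sam3DualViTDetNeck and text-encoder instances that the sam3 builders construct elsewhere, and the image size simply follows the 1008 default used by set_imgsz above.

# Minimal sketch, assuming `vision_neck` (a Sam3DualViTDetNeck) and `text_encoder`
# have already been built by the SAM3 model builders; both names are placeholders.
import torch

backbone = SAM3VLBackbone(visual=vision_neck, text=text_encoder, scalp=0)

images = torch.randn(2, 3, 1008, 1008)  # batch of two RGB images at the default SAM3 input size
out = backbone(images, captions=["a red car", "a dog"])

# forward() merges the vision and text outputs into one dictionary
print(out["vision_features"].shape)                              # last retained FPN level
print(out["language_features"].shape, out["language_mask"].shape)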
ultralytics/models/yolo/obb/val.py CHANGED
@@ -12,6 +12,7 @@ from ultralytics.models.yolo.detect import DetectionValidator
 from ultralytics.utils import LOGGER, ops
 from ultralytics.utils.metrics import OBBMetrics, batch_probiou
 from ultralytics.utils.nms import TorchNMS
+from ultralytics.utils.plotting import plot_images
 
 
 class OBBValidator(DetectionValidator):
@@ -141,24 +142,34 @@ class OBBValidator(DetectionValidator):
             "im_file": batch["im_file"][si],
         }
 
-    def plot_predictions(self, batch: dict[str, Any], preds: list[torch.Tensor], ni: int) -> None:
+    def plot_predictions(self, batch: dict[str, Any], preds: list[dict[str, torch.Tensor]], ni: int) -> None:
         """Plot predicted bounding boxes on input images and save the result.
 
         Args:
             batch (dict[str, Any]): Batch data containing images, file paths, and other metadata.
-            preds (list[torch.Tensor]): List of prediction
+            preds (list[dict[str, torch.Tensor]]): List of prediction dictionaries for each image in the batch.
             ni (int): Batch index used for naming the output file.
 
         Examples:
             >>> validator = OBBValidator()
             >>> batch = {"img": images, "im_file": paths}
-            >>> preds = [torch.rand(10,
+            >>> preds = [{"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}]
             >>> validator.plot_predictions(batch, preds, 0)
         """
-
-
-
-
+        if not preds:
+            return
+        for i, pred in enumerate(preds):
+            pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
+        keys = preds[0].keys()
+        batched_preds = {k: torch.cat([x[k] for x in preds], dim=0) for k in keys}
+        plot_images(
+            images=batch["img"],
+            labels=batched_preds,
+            paths=batch["im_file"],
+            fname=self.save_dir / f"val_batch{ni}_pred.jpg",
+            names=self.names,
+            on_plot=self.on_plot,
+        )
 
     def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
         """Convert YOLO predictions to COCO JSON format with rotated bounding box information.
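The reworked plot_predictions now takes one dictionary per image instead of raw tensors and batches them itself before calling plot_images. A rough sketch of that data flow, with illustrative tensor shapes borrowed from the docstring example above:

# Illustrative only: fake per-image prediction dicts in the format the new method expects.
import torch

preds = [
    {"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}  # rotated boxes (xywhr)
    for _ in range(2)  # one dict per image in the batch
]

# Mirrors the batching done inside plot_predictions: tag each dict with its image index,
# then concatenate key-by-key so plot_images receives a single flat label dict.
for i, pred in enumerate(preds):
    pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
batched = {k: torch.cat([p[k] for p in preds], dim=0) for k in preds[0].keys()}
print({k: tuple(v.shape) for k, v in batched.items()})  # each key now holds 20 rows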
ultralytics/nn/modules/transformer.py CHANGED
@@ -359,7 +359,15 @@ class MLP(nn.Module):
     """
 
     def __init__(
-        self,
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act=nn.ReLU,
+        sigmoid: bool = False,
+        residual: bool = False,
+        out_norm: nn.Module = None,
     ):
         """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
@@ -370,6 +378,8 @@ class MLP(nn.Module):
             num_layers (int): Number of layers.
             act (nn.Module): Activation function.
             sigmoid (bool): Whether to apply sigmoid to the output.
+            residual (bool): Whether to use residual connections.
+            out_norm (nn.Module, optional): Normalization layer for the output.
         """
         super().__init__()
         self.num_layers = num_layers
@@ -377,6 +387,12 @@ class MLP(nn.Module):
         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
         self.sigmoid = sigmoid
         self.act = act()
+        if residual and input_dim != output_dim:
+            raise ValueError("residual is only supported if input_dim == output_dim")
+        self.residual = residual
+        # whether to apply a normalization layer to the output
+        assert isinstance(out_norm, nn.Module) or out_norm is None
+        self.out_norm = out_norm or nn.Identity()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass for the entire MLP.
 
@@ -387,8 +403,12 @@ class MLP(nn.Module):
         Returns:
             (torch.Tensor): Output tensor after MLP.
         """
+        orig_x = x
         for i, layer in enumerate(self.layers):
             x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+        if getattr(self, "residual", False):
+            x = x + orig_x
+        x = getattr(self, "out_norm", nn.Identity())(x)
         return x.sigmoid() if getattr(self, "sigmoid", False) else x
 
 
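The two new MLP options compose as follows; this is a small sketch against the updated module, not code from the diff. residual adds the input back onto the output (hence the input_dim == output_dim check), and out_norm is applied last.

# Small sketch of the extended MLP; assumes ultralytics 8.3.237 is installed.
import torch
import torch.nn as nn
from ultralytics.nn.modules.transformer import MLP

mlp = MLP(input_dim=256, hidden_dim=512, output_dim=256, num_layers=3, residual=True, out_norm=nn.LayerNorm(256))

x = torch.randn(4, 256)
y = mlp(x)            # y = LayerNorm(mlp_layers(x) + x)
print(y.shape)        # torch.Size([4, 256])

# Mismatched dims with residual=True raise at construction time:
# MLP(input_dim=256, hidden_dim=512, output_dim=128, num_layers=3, residual=True)  # ValueError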
ultralytics/utils/checks.py CHANGED
@@ -379,8 +379,8 @@ def check_apt_requirements(requirements):
             f"{prefix} Ultralytics requirement{'s' * (len(missing_packages) > 1)} {missing_packages} not found, attempting AutoUpdate..."
         )
         # Optionally update package list first
-        if is_sudo_available()
-
+        cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "update"]
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
 
         # Build and run the install command
         cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "install", "-y"] + missing_packages
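The fix replaces a malformed apt-update call with the same prepend-sudo pattern already used for the install command below it. A tiny sketch of the pattern, with is_sudo_available stubbed so it runs standalone (the real helper lives in ultralytics.utils.checks):

# Sketch of the command-construction pattern; is_sudo_available is a stub here.
def is_sudo_available() -> bool:
    return False  # pretend sudo is absent

cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "update"]
print(cmd)  # ['apt', 'update'] without sudo, ['sudo', 'apt', 'update'] with it
# In the patched code this list is passed to subprocess.run(cmd, check=True, capture_output=True, text=True).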
ultralytics/utils/ops.py CHANGED
@@ -660,6 +660,4 @@ def clean_str(s):
 
 def empty_like(x):
     """Create empty torch.Tensor or np.ndarray with same shape as input and float32 dtype."""
-    return (
-        torch.empty_like(x, dtype=torch.float32) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=np.float32)
-    )
+    return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)
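The empty_like change swaps the hard-coded float32 output dtype for the input's own dtype. A self-contained sketch of the new behavior (a local copy of the patched one-liner, not an import from the package):

# Local copy of the patched empty_like logic to show the dtype difference.
import numpy as np
import torch

def empty_like(x):
    """Create an empty tensor/array with the same shape and dtype as the input."""
    return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)

print(empty_like(torch.zeros(3, dtype=torch.float16)).dtype)  # torch.float16 (previously float32)
print(empty_like(np.zeros(3, dtype=np.int32)).dtype)          # int32 (previously float32)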