dgenerate-ultralytics-headless 8.3.236-py3-none-any.whl → 8.3.237-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/RECORD +38 -25
  3. ultralytics/__init__.py +1 -1
  4. ultralytics/engine/exporter.py +17 -10
  5. ultralytics/engine/predictor.py +3 -2
  6. ultralytics/engine/trainer.py +8 -0
  7. ultralytics/models/rtdetr/val.py +5 -1
  8. ultralytics/models/sam/__init__.py +14 -1
  9. ultralytics/models/sam/build.py +17 -8
  10. ultralytics/models/sam/build_sam3.py +374 -0
  11. ultralytics/models/sam/model.py +12 -4
  12. ultralytics/models/sam/modules/blocks.py +20 -8
  13. ultralytics/models/sam/modules/decoders.py +2 -3
  14. ultralytics/models/sam/modules/encoders.py +4 -1
  15. ultralytics/models/sam/modules/memory_attention.py +6 -2
  16. ultralytics/models/sam/modules/sam.py +150 -6
  17. ultralytics/models/sam/modules/utils.py +134 -4
  18. ultralytics/models/sam/predict.py +2076 -118
  19. ultralytics/models/sam/sam3/__init__.py +3 -0
  20. ultralytics/models/sam/sam3/decoder.py +546 -0
  21. ultralytics/models/sam/sam3/encoder.py +535 -0
  22. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  23. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  24. ultralytics/models/sam/sam3/model_misc.py +198 -0
  25. ultralytics/models/sam/sam3/necks.py +129 -0
  26. ultralytics/models/sam/sam3/sam3_image.py +357 -0
  27. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  28. ultralytics/models/sam/sam3/tokenizer_ve.py +242 -0
  29. ultralytics/models/sam/sam3/vitdet.py +546 -0
  30. ultralytics/models/sam/sam3/vl_combiner.py +165 -0
  31. ultralytics/models/yolo/obb/val.py +18 -7
  32. ultralytics/nn/modules/transformer.py +21 -1
  33. ultralytics/utils/checks.py +2 -2
  34. ultralytics/utils/ops.py +1 -3
  35. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/WHEEL +0 -0
  36. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/entry_points.txt +0 -0
  37. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/licenses/LICENSE +0 -0
  38. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/sam3/model_misc.py (new file)
@@ -0,0 +1,198 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+"""Various utility models."""
+
+from __future__ import annotations
+
+import math
+
+import numpy as np
+import torch
+from torch import Tensor, nn
+
+
+class DotProductScoring(torch.nn.Module):
+    """A module that computes dot-product scores between a set of query features and a pooled prompt embedding."""
+
+    def __init__(
+        self,
+        d_model,
+        d_proj,
+        prompt_mlp=None,
+        clamp_logits=True,
+        clamp_max_val=12.0,
+    ):
+        """Initialize the DotProductScoring module."""
+        super().__init__()
+        self.d_proj = d_proj
+        assert isinstance(prompt_mlp, torch.nn.Module) or prompt_mlp is None
+        self.prompt_mlp = prompt_mlp  # an optional MLP projection for prompt
+        self.prompt_proj = torch.nn.Linear(d_model, d_proj)
+        self.hs_proj = torch.nn.Linear(d_model, d_proj)
+        self.scale = float(1.0 / np.sqrt(d_proj))
+        self.clamp_logits = clamp_logits
+        if self.clamp_logits:
+            self.clamp_max_val = clamp_max_val
+
+    def mean_pool_text(self, prompt, prompt_mask):
+        """Mean-pool the prompt embeddings over the valid tokens only."""
+        # is_valid has shape (seq, bs, 1), where 1 is valid and 0 is padding
+        is_valid = (~prompt_mask).to(prompt.dtype).permute(1, 0)[..., None]
+        # num_valid has shape (bs, 1)
+        num_valid = torch.clamp(torch.sum(is_valid, dim=0), min=1.0)
+        # mean pool over all the valid tokens -- pooled_prompt has shape (bs, proj_dim)
+        pooled_prompt = (prompt * is_valid).sum(dim=0) / num_valid
+        return pooled_prompt
+
+    def forward(self, hs, prompt, prompt_mask):
+        """Compute dot-product scores between hs and prompt."""
+        # hs has shape (num_layer, bs, num_query, d_model)
+        # prompt has shape (seq, bs, d_model)
+        # prompt_mask has shape (bs, seq), where True marks padding and False marks valid tokens
+        assert hs.dim() == 4 and prompt.dim() == 3 and prompt_mask.dim() == 2
+
+        # apply MLP on prompt if specified
+        if self.prompt_mlp is not None:
+            prompt = self.prompt_mlp(prompt.to(hs.dtype))
+
+        # first, get the mean-pooled version of the prompt
+        pooled_prompt = self.mean_pool_text(prompt, prompt_mask)
+
+        # then, project pooled_prompt and hs to d_proj dimensions
+        proj_pooled_prompt = self.prompt_proj(pooled_prompt)  # (bs, d_proj)
+        proj_hs = self.hs_proj(hs)  # (num_layer, bs, num_query, d_proj)
+
+        # finally, get dot-product scores of shape (num_layer, bs, num_query, 1)
+        scores = torch.matmul(proj_hs, proj_pooled_prompt.unsqueeze(-1))
+        scores *= self.scale
+
+        # clamp scores to a max value to avoid numerical issues in loss or matcher
+        if self.clamp_logits:
+            scores.clamp_(min=-self.clamp_max_val, max=self.clamp_max_val)
+
+        return scores
+
+
+class LayerScale(nn.Module):
+    """LayerScale module that scales the input by a learnable per-channel parameter (gamma)."""
+
+    def __init__(
+        self,
+        dim: int,
+        init_values: float | Tensor = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        """Initialize the LayerScale module."""
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply LayerScale to the input tensor."""
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class TransformerWrapper(nn.Module):
+    """A wrapper for the transformer consisting of an encoder and a decoder."""
+
+    def __init__(
+        self,
+        encoder,
+        decoder,
+        d_model: int,
+        two_stage_type="none",  # ["none"] only for now
+        pos_enc_at_input_dec=True,
+    ):
+        """Initialize the TransformerWrapper."""
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.num_queries = decoder.num_queries if decoder is not None else None
+        self.pos_enc_at_input_dec = pos_enc_at_input_dec
+
+        # for two stage
+        assert two_stage_type in ["none"], f"unknown param {two_stage_type} of two_stage_type"
+        self.two_stage_type = two_stage_type
+
+        self._reset_parameters()
+        self.d_model = d_model
+
+    def _reset_parameters(self):
+        """Initialize the parameters of the model."""
+        for n, p in self.named_parameters():
+            if p.dim() > 1:
+                if "box_embed" not in n and "query_embed" not in n and "reference_points" not in n:
+                    nn.init.xavier_uniform_(p)
+
+
+def get_valid_ratio(mask):
+    """Compute the valid ratio of height and width from the mask."""
+    _, H, W = mask.shape
+    valid_H = torch.sum(~mask[:, :, 0], 1)
+    valid_W = torch.sum(~mask[:, 0, :], 1)
+    valid_ratio_h = valid_H.float() / H
+    valid_ratio_w = valid_W.float() / W
+    valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+    return valid_ratio
+
+
+def gen_sineembed_for_position(pos_tensor: torch.Tensor, num_feats: int = 256):
+    """Generate sinusoidal position embeddings for 2D or 4D coordinate tensors.
+
+    This function creates sinusoidal embeddings using sine and cosine functions at different frequencies, similar to
+    the positional encoding used in Transformer models. It supports both 2D position tensors (x, y) and 4D tensors
+    (x, y, w, h) for bounding box coordinates.
+
+    Args:
+        pos_tensor (torch.Tensor): Input position tensor of shape (n_query, bs, 2) for 2D coordinates or (n_query, bs,
+            4) for 4D coordinates (bounding boxes).
+        num_feats (int): Number of feature dimensions for the output embedding. Must be even. Defaults to 256.
+
+    Returns:
+        (torch.Tensor): Sinusoidal position embeddings of shape (n_query, bs, num_feats) for 2D input or (n_query, bs,
+            num_feats * 2) for 4D input.
+
+    Raises:
+        AssertionError: If num_feats is not even.
+        ValueError: If pos_tensor.size(-1) is not 2 or 4.
+
+    Examples:
+        >>> pos_2d = torch.rand(100, 8, 2)  # 100 queries, batch size 8, 2D coordinates
+        >>> embeddings_2d = gen_sineembed_for_position(pos_2d, num_feats=256)
+        >>> embeddings_2d.shape
+        torch.Size([100, 8, 256])
+        >>> pos_4d = torch.rand(50, 4, 4)  # 50 queries, batch size 4, 4D coordinates
+        >>> embeddings_4d = gen_sineembed_for_position(pos_4d, num_feats=128)
+        >>> embeddings_4d.shape
+        torch.Size([50, 4, 256])
+    """
+    assert num_feats % 2 == 0
+    num_feats = num_feats // 2
+    # n_query, bs, _ = pos_tensor.size()
+    # sineembed_tensor = torch.zeros(n_query, bs, 256)
+    scale = 2 * math.pi
+    dim_t = torch.arange(num_feats, dtype=pos_tensor.dtype, device=pos_tensor.device)
+    dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode="floor")) / num_feats)
+    x_embed = pos_tensor[:, :, 0] * scale
+    y_embed = pos_tensor[:, :, 1] * scale
+    pos_x = x_embed[:, :, None] / dim_t
+    pos_y = y_embed[:, :, None] / dim_t
+    pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+    pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+    if pos_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=2)
+    elif pos_tensor.size(-1) == 4:
+        w_embed = pos_tensor[:, :, 2] * scale
+        pos_w = w_embed[:, :, None] / dim_t
+        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
+
+        h_embed = pos_tensor[:, :, 3] * scale
+        pos_h = h_embed[:, :, None] / dim_t
+        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
+
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
+    else:
+        raise ValueError(f"Unknown pos_tensor shape(-1):{pos_tensor.size(-1)}")
+    return pos
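
The helpers above are shape-sensitive: decoder queries are layer-first, prompts are sequence-first, and the padding mask is batch-first. The sketch below is not part of the diff; it assumes the 8.3.237 wheel is installed so that `ultralytics.models.sam.sam3.model_misc` is importable, and all tensor sizes are illustrative.

```python
# Hypothetical smoke test for the new model_misc helpers (not from the diff).
import torch

from ultralytics.models.sam.sam3.model_misc import DotProductScoring, gen_sineembed_for_position

# Dot-product scoring: decoder queries hs (num_layer, bs, num_query, d_model)
# against a prompt (seq, bs, d_model) with a (bs, seq) padding mask (True = padding).
scorer = DotProductScoring(d_model=256, d_proj=128)
hs = torch.randn(3, 2, 100, 256)                   # 3 decoder layers, batch 2, 100 queries
prompt = torch.randn(7, 2, 256)                    # 7 prompt tokens
prompt_mask = torch.zeros(2, 7, dtype=torch.bool)  # all tokens valid (no padding)
scores = scorer(hs, prompt, prompt_mask)
print(scores.shape)  # torch.Size([3, 2, 100, 1]), clamped to [-12, 12]

# Sinusoidal embeddings for 2D points and 4D boxes (cx, cy, w, h).
print(gen_sineembed_for_position(torch.rand(100, 2, 2), num_feats=256).shape)  # (100, 2, 256)
print(gen_sineembed_for_position(torch.rand(50, 2, 4), num_feats=128).shape)   # (50, 2, 256)
```

For 2D inputs the embedding width equals `num_feats`, while 4D boxes produce `num_feats * 2`, matching the docstring above.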
ultralytics/models/sam/sam3/necks.py (new file)
@@ -0,0 +1,129 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+"""Necks are the interface between a vision backbone and the rest of the detection model."""
+
+from __future__ import annotations
+
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+
+
+class Sam3DualViTDetNeck(nn.Module):
+    """A neck that implements a simple FPN as in ViTDet, with support for dual necks (for SAM3 and SAM2)."""
+
+    def __init__(
+        self,
+        trunk: nn.Module,
+        position_encoding: nn.Module,
+        d_model: int,
+        scale_factors=(4.0, 2.0, 1.0, 0.5),
+        add_sam2_neck: bool = False,
+    ):
+        """
+        SimpleFPN neck a la ViTDet (from detectron2, very lightly adapted).
+
+        It supports a "dual neck" setting, where we have two identical necks (for SAM3 and SAM2) with different weights.
+
+        :param trunk: the backbone
+        :param position_encoding: the positional encoding to use
+        :param d_model: the dimension of the model
+        """
+        super().__init__()
+        self.trunk = trunk
+        self.position_encoding = position_encoding
+        self.convs = nn.ModuleList()
+
+        self.scale_factors = scale_factors
+        use_bias = True
+        dim: int = self.trunk.channel_list[-1]
+
+        for _, scale in enumerate(scale_factors):
+            current = nn.Sequential()
+
+            if scale == 4.0:
+                current.add_module(
+                    "dconv_2x2_0",
+                    nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
+                )
+                current.add_module(
+                    "gelu",
+                    nn.GELU(),
+                )
+                current.add_module(
+                    "dconv_2x2_1",
+                    nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
+                )
+                out_dim = dim // 4
+            elif scale == 2.0:
+                current.add_module(
+                    "dconv_2x2",
+                    nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
+                )
+                out_dim = dim // 2
+            elif scale == 1.0:
+                out_dim = dim
+            elif scale == 0.5:
+                current.add_module(
+                    "maxpool_2x2",
+                    nn.MaxPool2d(kernel_size=2, stride=2),
+                )
+                out_dim = dim
+            else:
+                raise NotImplementedError(f"scale_factor={scale} is not supported yet.")
+
+            current.add_module(
+                "conv_1x1",
+                nn.Conv2d(
+                    in_channels=out_dim,
+                    out_channels=d_model,
+                    kernel_size=1,
+                    bias=use_bias,
+                ),
+            )
+            current.add_module(
+                "conv_3x3",
+                nn.Conv2d(
+                    in_channels=d_model,
+                    out_channels=d_model,
+                    kernel_size=3,
+                    padding=1,
+                    bias=use_bias,
+                ),
+            )
+            self.convs.append(current)
+
+        self.sam2_convs = None
+        if add_sam2_neck:
+            # Assumes sam2 neck is just a clone of the original neck
+            self.sam2_convs = deepcopy(self.convs)
+
+    def forward(
+        self, tensor_list: list[torch.Tensor]
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
+        """Get the feature maps and positional encodings from the neck."""
+        xs = self.trunk(tensor_list)
+        sam3_out, sam3_pos = [], []
+        sam2_out, sam2_pos = None, None
+        if self.sam2_convs is not None:
+            sam2_out, sam2_pos = [], []
+        x = xs[-1]  # simpleFPN
+        for i in range(len(self.convs)):
+            sam3_x_out = self.convs[i](x)
+            sam3_pos_out = self.position_encoding(sam3_x_out).to(sam3_x_out.dtype)
+            sam3_out.append(sam3_x_out)
+            sam3_pos.append(sam3_pos_out)
+
+            if self.sam2_convs is not None:
+                sam2_x_out = self.sam2_convs[i](x)
+                sam2_pos_out = self.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
+                sam2_out.append(sam2_x_out)
+                sam2_pos.append(sam2_pos_out)
+        return sam3_out, sam3_pos, sam2_out, sam2_pos
+
+    def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
+        """Set the image size for the trunk backbone."""
+        self.trunk.set_imgsz(imgsz)
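
To see how the neck expands the last trunk feature map into one output per scale factor, here is a minimal sketch (not from the diff): `DummyTrunk` and `DummyPosEnc` are hypothetical stand-ins for the real ViTDet trunk and position encoding, and the import assumes the 8.3.237 wheel is installed.

```python
# Illustrative only: exercise Sam3DualViTDetNeck with stand-in modules.
import torch
import torch.nn as nn

from ultralytics.models.sam.sam3.necks import Sam3DualViTDetNeck


class DummyTrunk(nn.Module):
    channel_list = [1024]  # the neck reads channel_list[-1]

    def forward(self, x):
        # pretend ViT output at stride 16 for a 1008x1008 input: (B, 1024, 63, 63)
        return [torch.randn(x.shape[0], 1024, 63, 63)]

    def set_imgsz(self, imgsz):
        pass


class DummyPosEnc(nn.Module):
    def forward(self, x):
        return torch.zeros_like(x)  # same (B, d_model, H, W) shape as the feature map


neck = Sam3DualViTDetNeck(DummyTrunk(), DummyPosEnc(), d_model=256, add_sam2_neck=True)
feats, pos, sam2_feats, sam2_pos = neck(torch.randn(1, 3, 1008, 1008))
print([tuple(f.shape) for f in feats])
# [(1, 256, 252, 252), (1, 256, 126, 126), (1, 256, 63, 63), (1, 256, 31, 31)]
```

With the default `scale_factors=(4.0, 2.0, 1.0, 0.5)` the four outputs sit at roughly strides 4, 8, 16, and 32, and `add_sam2_neck=True` yields a second, independently weighted set of the same shapes for the SAM2 path.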
ultralytics/models/sam/sam3/sam3_image.py (new file)
@@ -0,0 +1,357 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+from __future__ import annotations
+
+from copy import deepcopy
+
+import torch
+
+from ultralytics.nn.modules.utils import inverse_sigmoid
+from ultralytics.utils.ops import xywh2xyxy
+
+from .geometry_encoders import Prompt
+from .vl_combiner import SAM3VLBackbone
+
+
+def _update_out(out, out_name, out_value, auxiliary=True, update_aux=True):
+    """Helper function to update output dictionary with main and auxiliary outputs."""
+    out[out_name] = out_value[-1] if auxiliary else out_value
+    if auxiliary and update_aux:
+        if "aux_outputs" not in out:
+            out["aux_outputs"] = [{} for _ in range(len(out_value) - 1)]
+        assert len(out["aux_outputs"]) == len(out_value) - 1
+        for aux_output, aux_value in zip(out["aux_outputs"], out_value[:-1]):
+            aux_output[out_name] = aux_value
+
+
+class SAM3SemanticModel(torch.nn.Module):
+    """SAM3 model for semantic segmentation with vision-language backbone."""
+
+    def __init__(
+        self,
+        backbone: SAM3VLBackbone,
+        transformer,
+        input_geometry_encoder,
+        segmentation_head=None,
+        num_feature_levels=1,
+        o2m_mask_predict=True,
+        dot_prod_scoring=None,
+        use_instance_query: bool = True,
+        multimask_output: bool = True,
+        use_act_checkpoint_seg_head: bool = True,
+        matcher=None,
+        use_dot_prod_scoring=True,
+        supervise_joint_box_scores: bool = False,  # only relevant if using presence token/score
+        detach_presence_in_joint_score: bool = False,  # only relevant if using presence token/score
+        separate_scorer_for_instance: bool = False,
+        num_interactive_steps_val: int = 0,
+    ):
+        """Initialize the SAM3SemanticModel."""
+        super().__init__()
+        self.backbone = backbone
+        self.geometry_encoder = input_geometry_encoder
+        self.transformer = transformer
+        self.hidden_dim = transformer.d_model
+        self.num_feature_levels = num_feature_levels
+        self.segmentation_head = segmentation_head
+
+        self.o2m_mask_predict = o2m_mask_predict
+
+        self.dot_prod_scoring = dot_prod_scoring
+        self.use_act_checkpoint_seg_head = use_act_checkpoint_seg_head
+        self.matcher = matcher
+
+        self.num_interactive_steps_val = num_interactive_steps_val
+        self.use_dot_prod_scoring = use_dot_prod_scoring
+
+        if self.use_dot_prod_scoring:
+            assert dot_prod_scoring is not None
+            self.dot_prod_scoring = dot_prod_scoring
+            self.instance_dot_prod_scoring = None
+            if separate_scorer_for_instance:
+                self.instance_dot_prod_scoring = deepcopy(dot_prod_scoring)
+        else:
+            self.class_embed = torch.nn.Linear(self.hidden_dim, 1)
+            self.instance_class_embed = None
+            if separate_scorer_for_instance:
+                self.instance_class_embed = deepcopy(self.class_embed)
+
+        self.supervise_joint_box_scores = supervise_joint_box_scores
+        self.detach_presence_in_joint_score = detach_presence_in_joint_score
+
+        # verify the number of queries for O2O and O2M
+        num_o2o_static = self.transformer.decoder.num_queries
+        num_o2m_static = self.transformer.decoder.num_o2m_queries
+        assert num_o2m_static == (num_o2o_static if self.transformer.decoder.dac else 0)
+        self.dac = self.transformer.decoder.dac
+
+        self.use_instance_query = use_instance_query
+        self.multimask_output = multimask_output
+
+        self.text_embeddings = {}
+        self.names = []
+
+    def _prepare_backbone_features(self, backbone_out, num_prompts=1):
+        """Prepare and flatten visual features from the image backbone output for further processing."""
+        if num_prompts > 1:  # expand features if there's more than one prompt
+            for i, feat in enumerate(backbone_out["backbone_fpn"]):
+                backbone_out["backbone_fpn"][i] = feat.expand(num_prompts, -1, -1, -1)
+            for i, pos in enumerate(backbone_out["vision_pos_enc"]):
+                pos = pos.expand(num_prompts, -1, -1, -1)
+                backbone_out["vision_pos_enc"][i] = pos
+        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
+        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
+
+        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
+        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
+        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
+        # flatten NxCxHxW to HWxNxC
+        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
+        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
+        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
+
+    def _encode_prompt(
+        self,
+        img_feats,
+        img_pos_embeds,
+        vis_feat_sizes,
+        geometric_prompt,
+        visual_prompt_embed=None,
+        visual_prompt_mask=None,
+        prev_mask_pred=None,
+    ):
+        """Encode the geometric and visual prompts."""
+        if prev_mask_pred is not None:
+            img_feats = [img_feats[-1] + prev_mask_pred]
+        # Encode geometry
+        geo_feats, geo_masks = self.geometry_encoder(
+            geo_prompt=geometric_prompt,
+            img_feats=img_feats,
+            img_sizes=vis_feat_sizes,
+            img_pos_embeds=img_pos_embeds,
+        )
+        if visual_prompt_embed is None:
+            visual_prompt_embed = torch.zeros((0, *geo_feats.shape[1:]), device=geo_feats.device)
+            visual_prompt_mask = torch.zeros(
+                (*geo_masks.shape[:-1], 0),
+                device=geo_masks.device,
+                dtype=geo_masks.dtype,
+            )
+        prompt = torch.cat([geo_feats, visual_prompt_embed], dim=0)
+        prompt_mask = torch.cat([geo_masks, visual_prompt_mask], dim=1)
+        return prompt, prompt_mask
+
+    def _run_encoder(
+        self,
+        img_feats,
+        img_pos_embeds,
+        vis_feat_sizes,
+        prompt,
+        prompt_mask,
+        encoder_extra_kwargs: dict | None = None,
+    ):
+        """Run the transformer encoder."""
+        # Run the encoder
+        # make a copy of the image feature lists since the encoder may modify these lists in-place
+        memory = self.transformer.encoder(
+            src=img_feats.copy(),
+            src_key_padding_mask=None,
+            src_pos=img_pos_embeds.copy(),
+            prompt=prompt,
+            prompt_key_padding_mask=prompt_mask,
+            feat_sizes=vis_feat_sizes,
+            encoder_extra_kwargs=encoder_extra_kwargs,
+        )
+        encoder_out = {
+            # encoded image features
+            "encoder_hidden_states": memory["memory"],
+            "pos_embed": memory["pos_embed"],
+            "padding_mask": memory["padding_mask"],
+            "spatial_shapes": memory["spatial_shapes"],
+            "valid_ratios": memory["valid_ratios"],
+            "vis_feat_sizes": vis_feat_sizes,
+            # encoded text features (or other prompts)
+            "prompt_before_enc": prompt,
+            "prompt_after_enc": memory.get("memory_text", prompt),
+            "prompt_mask": prompt_mask,
+        }
+        return encoder_out
+
+    def _run_decoder(
+        self,
+        pos_embed,
+        memory,
+        src_mask,
+        out,
+        prompt,
+        prompt_mask,
+        encoder_out,
+    ):
+        """Run the transformer decoder."""
+        bs = memory.shape[1]
+        query_embed = self.transformer.decoder.query_embed.weight
+        tgt = query_embed.unsqueeze(1).repeat(1, bs, 1)
+
+        hs, reference_boxes, dec_presence_out, _ = self.transformer.decoder(
+            tgt=tgt,
+            memory=memory,
+            memory_key_padding_mask=src_mask,
+            pos=pos_embed,
+            reference_boxes=None,
+            spatial_shapes=encoder_out["spatial_shapes"],
+            valid_ratios=encoder_out["valid_ratios"],
+            tgt_mask=None,
+            memory_text=prompt,
+            text_attention_mask=prompt_mask,
+            apply_dac=False,
+        )
+        hs = hs.transpose(1, 2)  # seq-first to batch-first
+        reference_boxes = reference_boxes.transpose(1, 2)  # seq-first to batch-first
+        if dec_presence_out is not None:
+            # seq-first to batch-first
+            dec_presence_out = dec_presence_out.transpose(1, 2)
+        self._update_scores_and_boxes(
+            out,
+            hs,
+            reference_boxes,
+            prompt,
+            prompt_mask,
+            dec_presence_out=dec_presence_out,
+        )
+        return out, hs
+
+    def _update_scores_and_boxes(
+        self,
+        out,
+        hs,
+        reference_boxes,
+        prompt,
+        prompt_mask,
+        dec_presence_out=None,
+        is_instance_prompt=False,
+    ):
+        """Update output dict with class scores and box predictions."""
+        num_o2o = hs.size(2)
+        # score prediction
+        if self.use_dot_prod_scoring:
+            dot_prod_scoring_head = self.dot_prod_scoring
+            if is_instance_prompt and self.instance_dot_prod_scoring is not None:
+                dot_prod_scoring_head = self.instance_dot_prod_scoring
+            outputs_class = dot_prod_scoring_head(hs, prompt, prompt_mask)
+        else:
+            class_embed_head = self.class_embed
+            if is_instance_prompt and self.instance_class_embed is not None:
+                class_embed_head = self.instance_class_embed
+            outputs_class = class_embed_head(hs)
+
+        # box prediction
+        box_head = self.transformer.decoder.bbox_embed
+        if is_instance_prompt and self.transformer.decoder.instance_bbox_embed is not None:
+            box_head = self.transformer.decoder.instance_bbox_embed
+        anchor_box_offsets = box_head(hs)
+        reference_boxes_inv_sig = inverse_sigmoid(reference_boxes)
+        outputs_coord = (reference_boxes_inv_sig + anchor_box_offsets).sigmoid()
+        outputs_boxes_xyxy = xywh2xyxy(outputs_coord)
+
+        if dec_presence_out is not None:
+            _update_out(out, "presence_logit_dec", dec_presence_out, update_aux=False)
+
+        if self.supervise_joint_box_scores:
+            assert dec_presence_out is not None
+            prob_dec_presence_out = dec_presence_out.clone().sigmoid()
+            if self.detach_presence_in_joint_score:
+                prob_dec_presence_out = prob_dec_presence_out.detach()
+
+            outputs_class = inverse_sigmoid(outputs_class.sigmoid() * prob_dec_presence_out.unsqueeze(2)).clamp(
+                min=-10.0, max=10.0
+            )
+
+        _update_out(out, "pred_logits", outputs_class[:, :, :num_o2o], update_aux=False)
+        _update_out(out, "pred_boxes", outputs_coord[:, :, :num_o2o], update_aux=False)
+        _update_out(out, "pred_boxes_xyxy", outputs_boxes_xyxy[:, :, :num_o2o], update_aux=False)
+
+    def _run_segmentation_heads(
+        self,
+        out,
+        backbone_out,
+        encoder_hidden_states,
+        prompt,
+        prompt_mask,
+        hs,
+    ):
+        """Run segmentation heads and get masks."""
+        if self.segmentation_head is not None:
+            num_o2o = hs.size(2)
+            obj_queries = hs if self.o2m_mask_predict else hs[:, :, :num_o2o]
+            seg_head_outputs = self.segmentation_head(
+                backbone_feats=backbone_out["backbone_fpn"],
+                obj_queries=obj_queries,
+                encoder_hidden_states=encoder_hidden_states,
+                prompt=prompt,
+                prompt_mask=prompt_mask,
+            )
+            for k, v in seg_head_outputs.items():
+                if k in self.segmentation_head.instance_keys:
+                    _update_out(out, k, v[:, :num_o2o], auxiliary=False)
+                else:
+                    out[k] = v
+        else:
+            backbone_out.pop("backbone_fpn", None)
+
+    def forward_grounding(
+        self, backbone_out: dict[str, torch.Tensor], text_ids: torch.Tensor, geometric_prompt: Prompt = None
+    ):
+        """Forward pass for grounding (detection + segmentation) given input images and text."""
+        backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = self._prepare_backbone_features(
+            backbone_out, num_prompts=len(text_ids)
+        )
+        backbone_out.update({k: v for k, v in self.text_embeddings.items()})
+        with torch.profiler.record_function("SAM3Image._encode_prompt"):
+            prompt, prompt_mask = self._encode_prompt(img_feats, img_pos_embeds, vis_feat_sizes, geometric_prompt)
+            # index text features (note that regardless of early or late fusion, the batch size of
+            # `txt_feats` is always the number of *prompts* in the encoder)
+            txt_feats = backbone_out["language_features"][:, text_ids]
+            txt_masks = backbone_out["language_mask"][text_ids]
+            # encode text
+            prompt = torch.cat([txt_feats, prompt], dim=0)
+            prompt_mask = torch.cat([txt_masks, prompt_mask], dim=1)
+
+        # Run the encoder
+        with torch.profiler.record_function("SAM3Image._run_encoder"):
+            encoder_out = self._run_encoder(img_feats, img_pos_embeds, vis_feat_sizes, prompt, prompt_mask)
+        out = {"backbone_out": backbone_out}
+
+        # Run the decoder
+        with torch.profiler.record_function("SAM3Image._run_decoder"):
+            out, hs = self._run_decoder(
+                memory=encoder_out["encoder_hidden_states"],
+                pos_embed=encoder_out["pos_embed"],
+                src_mask=encoder_out["padding_mask"],
+                out=out,
+                prompt=prompt,
+                prompt_mask=prompt_mask,
+                encoder_out=encoder_out,
+            )
+
+        # Run segmentation heads
+        with torch.profiler.record_function("SAM3Image._run_segmentation_heads"):
+            self._run_segmentation_heads(
+                out=out,
+                backbone_out=backbone_out,
+                encoder_hidden_states=encoder_out["encoder_hidden_states"],
+                prompt=prompt,
+                prompt_mask=prompt_mask,
+                hs=hs,
+            )
+        return out
+
+    def set_classes(self, text: list[str]):
+        """Set the text embeddings for the given class names."""
+        self.text_embeddings = self.backbone.forward_text(text)
+        self.names = text
+
+    def set_imgsz(self, imgsz: tuple[int, int]):
+        """Set the image size for the model."""
+        self.backbone.set_imgsz(imgsz)
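
The `_update_out` helper is what splits per-decoder-layer predictions into the final output plus `aux_outputs` for auxiliary supervision. Below is a small, self-contained illustration, not part of the diff, assuming the installed 8.3.237 wheel exposes `ultralytics.models.sam.sam3.sam3_image`; tensor sizes are made up.

```python
# Illustrative sketch of the _update_out helper's main/auxiliary split.
import torch

from ultralytics.models.sam.sam3.sam3_image import _update_out

out = {}
logits = torch.randn(3, 2, 100, 1)  # (num_layers, bs, num_queries, 1)
boxes = torch.rand(3, 2, 100, 4)    # (num_layers, bs, num_queries, 4) in cxcywh

_update_out(out, "pred_logits", logits)  # defaults: auxiliary=True, update_aux=True
_update_out(out, "pred_boxes", boxes)

print(out["pred_logits"].shape)      # torch.Size([2, 100, 1]) -> last decoder layer only
print(len(out["aux_outputs"]))       # 2 -> one dict per intermediate decoder layer
print(out["aux_outputs"][0].keys())  # dict_keys(['pred_logits', 'pred_boxes'])
```

Note that `_update_scores_and_boxes` above calls the helper with `update_aux=False`, so only the top-level keys are written there; the defaults shown here additionally populate the per-layer auxiliary dicts used for auxiliary losses.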