PyPI - ultralytics - Versions diffs - 8.2.68__py3-none-any.whl → 8.2.70__py3-none-any.whl - Mend

ultralytics 8.2.68__py3-none-any.whl → 8.2.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ultralytics might be problematic. Click here for more details.

Files changed (37)

tests/test_cli.py +4 -16
ultralytics/__init__.py +3 -2
ultralytics/cfg/__init__.py +4 -0
ultralytics/data/augment.py +1 -1
ultralytics/hub/google/__init__.py +3 -3
ultralytics/models/__init__.py +2 -1
ultralytics/models/fastsam/__init__.py +1 -2
ultralytics/models/fastsam/model.py +18 -0
ultralytics/models/fastsam/predict.py +116 -1
ultralytics/models/sam/build.py +2 -2
ultralytics/models/sam/model.py +10 -2
ultralytics/models/sam/modules/decoders.py +1 -42
ultralytics/models/sam/modules/encoders.py +3 -1
ultralytics/models/sam/modules/sam.py +5 -7
ultralytics/models/sam/modules/transformer.py +4 -3
ultralytics/models/sam/predict.py +12 -6
ultralytics/models/sam2/__init__.py +6 -0
ultralytics/models/sam2/build.py +156 -0
ultralytics/models/sam2/model.py +97 -0
ultralytics/models/sam2/modules/__init__.py +1 -0
ultralytics/models/sam2/modules/decoders.py +305 -0
ultralytics/models/sam2/modules/encoders.py +332 -0
ultralytics/models/sam2/modules/memory_attention.py +170 -0
ultralytics/models/sam2/modules/sam2.py +804 -0
ultralytics/models/sam2/modules/sam2_blocks.py +715 -0
ultralytics/models/sam2/modules/utils.py +191 -0
ultralytics/models/sam2/predict.py +182 -0
ultralytics/nn/modules/transformer.py +5 -3
ultralytics/utils/ops.py +1 -1
ultralytics/utils/torch_utils.py +9 -6
{ultralytics-8.2.68.dist-info → ultralytics-8.2.70.dist-info}/METADATA +1 -1
{ultralytics-8.2.68.dist-info → ultralytics-8.2.70.dist-info}/RECORD +36 -26
{ultralytics-8.2.68.dist-info → ultralytics-8.2.70.dist-info}/WHEEL +1 -1
ultralytics/models/fastsam/prompt.py +0 -352
{ultralytics-8.2.68.dist-info → ultralytics-8.2.70.dist-info}/LICENSE +0 -0
{ultralytics-8.2.68.dist-info → ultralytics-8.2.70.dist-info}/entry_points.txt +0 -0
{ultralytics-8.2.68.dist-info → ultralytics-8.2.70.dist-info}/top_level.txt +0 -0

tests/test_cli.py CHANGED Viewed

@@ -68,7 +68,6 @@ def test_fastsam(task="segment", model=WEIGHTS_DIR / "FastSAM-s.pt", data="coco8
     run(f"yolo segment predict model={model} source={source} imgsz=32 save save_crop save_txt")
     from ultralytics import FastSAM
-    from ultralytics.models.fastsam import FastSAMPrompt
     from ultralytics.models.sam import Predictor
     # Create a FastSAM model
@@ -81,21 +80,10 @@ def test_fastsam(task="segment", model=WEIGHTS_DIR / "FastSAM-s.pt", data="coco8
         # Remove small regions
         new_masks, _ = Predictor.remove_small_regions(everything_results[0].masks.data, min_area=20)
-        # Everything prompt
-        prompt_process = FastSAMPrompt(s, everything_results, device="cpu")
-        ann = prompt_process.everything_prompt()
-        # Bbox default shape [0,0,0,0] -> [x1,y1,x2,y2]
-        ann = prompt_process.box_prompt(bbox=[200, 200, 300, 300])
-        # Text prompt
-        ann = prompt_process.text_prompt(text="a photo of a dog")
-        # Point prompt
-        # Points default [[0,0]] [[x1,y1],[x2,y2]]
-        # Point_label default [0] [1,0] 0:background, 1:foreground
-        ann = prompt_process.point_prompt(points=[[200, 200]], pointlabel=[1])
-        prompt_process.plot(annotations=ann, output="./")
+        # Run inference with bboxes and points and texts prompt at the same time
+        results = sam_model(
+            source, bboxes=[439, 437, 524, 709], points=[[200, 200]], labels=[1], texts="a photo of a dog"
+        )
 def test_mobilesam():

ultralytics/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
-__version__ = "8.2.68"
+__version__ = "8.2.70"
 import os
@@ -8,7 +8,7 @@ import os
 os.environ["OMP_NUM_THREADS"] = "1"  # reduce CPU utilization during training
 from ultralytics.data.explorer.explorer import Explorer
-from ultralytics.models import NAS, RTDETR, SAM, YOLO, FastSAM, YOLOWorld
+from ultralytics.models import NAS, RTDETR, SAM, SAM2, YOLO, FastSAM, YOLOWorld
 from ultralytics.utils import ASSETS, SETTINGS
 from ultralytics.utils.checks import check_yolo as checks
 from ultralytics.utils.downloads import download
@@ -21,6 +21,7 @@ __all__ = (
     "YOLOWorld",
     "NAS",
     "SAM",
+    "SAM2",
     "FastSAM",
     "RTDETR",
     "checks",

ultralytics/cfg/__init__.py CHANGED Viewed

@@ -793,6 +793,10 @@ def entrypoint(debug=""):
         from ultralytics import FastSAM
         model = FastSAM(model)
+    elif "sam2" in stem:
+        from ultralytics import SAM2
+        model = SAM2(model)
     elif "sam" in stem:
         from ultralytics import SAM

ultralytics/data/augment.py CHANGED Viewed

@@ -2221,7 +2221,7 @@ class RandomLoadText:
         pos_labels = np.unique(cls).tolist()
         if len(pos_labels) > self.max_samples:
-            pos_labels = set(random.sample(pos_labels, k=self.max_samples))
+            pos_labels = random.sample(pos_labels, k=self.max_samples)
         neg_samples = min(min(num_classes, self.max_samples) - len(pos_labels), random.randint(*self.neg_samples))
         neg_labels = [i for i in range(num_classes) if i not in pos_labels]

ultralytics/hub/google/__init__.py CHANGED Viewed

@@ -136,14 +136,14 @@ class GCPRegions:
         sorted_results = sorted(results, key=lambda x: x[1])
         if verbose:
-            print(f"{'Region':<20} {'Location':<35} {'Tier':<5} {'Latency (ms)'}")
+            print(f"{'Region':<25} {'Location':<35} {'Tier':<5} {'Latency (ms)'}")
             for region, mean, std, min_, max_ in sorted_results:
                 tier, city, country = self.regions[region]
                 location = f"{city}, {country}"
                 if mean == float("inf"):
-                    print(f"{region:<20} {location:<35} {tier:<5} {'Timeout'}")
+                    print(f"{region:<25} {location:<35} {tier:<5} {'Timeout'}")
                 else:
-                    print(f"{region:<20} {location:<35} {tier:<5} {mean:.0f} ± {std:.0f} ({min_:.0f} - {max_:.0f})")
+                    print(f"{region:<25} {location:<35} {tier:<5} {mean:.0f} ± {std:.0f} ({min_:.0f} - {max_:.0f})")
             print(f"\nLowest latency region{'s' if top > 1 else ''}:")
             for region, mean, std, min_, max_ in sorted_results[:top]:
                 tier, city, country = self.regions[region]

ultralytics/models/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ from .fastsam import FastSAM
 from .nas import NAS
 from .rtdetr import RTDETR
 from .sam import SAM
+from .sam2 import SAM2
 from .yolo import YOLO, YOLOWorld
-__all__ = "YOLO", "RTDETR", "SAM", "FastSAM", "NAS", "YOLOWorld"  # allow simpler import
+__all__ = "YOLO", "RTDETR", "SAM", "FastSAM", "NAS", "YOLOWorld", "SAM2"  # allow simpler import

ultralytics/models/fastsam/__init__.py CHANGED Viewed

@@ -2,7 +2,6 @@
 from .model import FastSAM
 from .predict import FastSAMPredictor
-from .prompt import FastSAMPrompt
 from .val import FastSAMValidator
-__all__ = "FastSAMPredictor", "FastSAM", "FastSAMPrompt", "FastSAMValidator"
+__all__ = "FastSAMPredictor", "FastSAM", "FastSAMValidator"

ultralytics/models/fastsam/model.py CHANGED Viewed

@@ -28,6 +28,24 @@ class FastSAM(Model):
         assert Path(model).suffix not in {".yaml", ".yml"}, "FastSAM models only support pre-trained models."
         super().__init__(model=model, task="segment")
+    def predict(self, source, stream=False, bboxes=None, points=None, labels=None, texts=None, **kwargs):
+        """
+        Performs segmentation prediction on the given image or video source.
+        Args:
+            source (str): Path to the image or video file, or a PIL.Image object, or a numpy.ndarray object.
+            stream (bool, optional): If True, enables real-time streaming. Defaults to False.
+            bboxes (list, optional): List of bounding box coordinates for prompted segmentation. Defaults to None.
+            points (list, optional): List of points for prompted segmentation. Defaults to None.
+            labels (list, optional): List of labels for prompted segmentation. Defaults to None.
+            texts (list, optional): List of texts for prompted segmentation. Defaults to None.
+        Returns:
+            (list): The model predictions.
+        """
+        prompts = dict(bboxes=bboxes, points=points, labels=labels, texts=texts)
+        return super().predict(source, stream, prompts=prompts, **kwargs)
     @property
     def task_map(self):
         """Returns a dictionary mapping segment task to corresponding predictor and validator classes."""

ultralytics/models/fastsam/predict.py CHANGED Viewed

@@ -1,8 +1,11 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 import torch
+from PIL import Image
 from ultralytics.models.yolo.segment import SegmentationPredictor
+from ultralytics.utils import DEFAULT_CFG, checks
 from ultralytics.utils.metrics import box_iou
+from ultralytics.utils.ops import scale_masks
 from .utils import adjust_bboxes_to_image_border
@@ -17,8 +20,17 @@ class FastSAMPredictor(SegmentationPredictor):
     class segmentation.
     """
+    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initializes a FastSAMPredictor for fast SAM segmentation tasks in Ultralytics YOLO framework."""
+        super().__init__(cfg, overrides, _callbacks)
+        self.prompts = {}
     def postprocess(self, preds, img, orig_imgs):
         """Applies box postprocess for FastSAM predictions."""
+        bboxes = self.prompts.pop("bboxes", None)
+        points = self.prompts.pop("points", None)
+        labels = self.prompts.pop("labels", None)
+        texts = self.prompts.pop("texts", None)
         results = super().postprocess(preds, img, orig_imgs)
         for result in results:
             full_box = torch.tensor(
@@ -28,4 +40,107 @@ class FastSAMPredictor(SegmentationPredictor):
             idx = torch.nonzero(box_iou(full_box[None], boxes) > 0.9).flatten()
             if idx.numel() != 0:
                 result.boxes.xyxy[idx] = full_box
-        return results
+        return self.prompt(results, bboxes=bboxes, points=points, labels=labels, texts=texts)
+    def prompt(self, results, bboxes=None, points=None, labels=None, texts=None):
+        """
+        Internal function for image segmentation inference based on cues like bounding boxes, points, and masks.
+        Leverages SAM's specialized architecture for prompt-based, real-time segmentation.
+        Args:
+            results (Results | List[Results]): The original inference results from FastSAM models without any prompts.
+            bboxes (np.ndarray | List, optional): Bounding boxes with shape (N, 4), in XYXY format.
+            points (np.ndarray | List, optional): Points indicating object locations with shape (N, 2), in pixels.
+            labels (np.ndarray | List, optional): Labels for point prompts, shape (N, ). 1 = foreground, 0 = background.
+            texts (str | List[str], optional): Textual prompts, a list contains string objects.
+        Returns:
+            (List[Results]): The output results determined by prompts.
+        """
+        if bboxes is None and points is None and texts is None:
+            return results
+        prompt_results = []
+        if not isinstance(results, list):
+            results = [results]
+        for result in results:
+            masks = result.masks.data
+            if masks.shape[1:] != result.orig_shape:
+                masks = scale_masks(masks[None], result.orig_shape)[0]
+            # bboxes prompt
+            idx = torch.zeros(len(result), dtype=torch.bool, device=self.device)
+            if bboxes is not None:
+                bboxes = torch.as_tensor(bboxes, dtype=torch.int32, device=self.device)
+                bboxes = bboxes[None] if bboxes.ndim == 1 else bboxes
+                bbox_areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0])
+                mask_areas = torch.stack([masks[:, b[1] : b[3], b[0] : b[2]].sum(dim=(1, 2)) for b in bboxes])
+                full_mask_areas = torch.sum(masks, dim=(1, 2))
+                union = bbox_areas[:, None] + full_mask_areas - mask_areas
+                idx[torch.argmax(mask_areas / union, dim=1)] = True
+            if points is not None:
+                points = torch.as_tensor(points, dtype=torch.int32, device=self.device)
+                points = points[None] if points.ndim == 1 else points
+                if labels is None:
+                    labels = torch.ones(points.shape[0])
+                labels = torch.as_tensor(labels, dtype=torch.int32, device=self.device)
+                assert len(labels) == len(
+                    points
+                ), f"Excepted `labels` got same size as `point`, but got {len(labels)} and {len(points)}"
+                point_idx = (
+                    torch.ones(len(result), dtype=torch.bool, device=self.device)
+                    if labels.sum() == 0  # all negative points
+                    else torch.zeros(len(result), dtype=torch.bool, device=self.device)
+                )
+                for p, l in zip(points, labels):
+                    point_idx[torch.nonzero(masks[:, p[1], p[0]], as_tuple=True)[0]] = True if l else False
+                idx |= point_idx
+            if texts is not None:
+                if isinstance(texts, str):
+                    texts = [texts]
+                crop_ims, filter_idx = [], []
+                for i, b in enumerate(result.boxes.xyxy.tolist()):
+                    x1, y1, x2, y2 = [int(x) for x in b]
+                    if masks[i].sum() <= 100:
+                        filter_idx.append(i)
+                        continue
+                    crop_ims.append(Image.fromarray(result.orig_img[y1:y2, x1:x2, ::-1]))
+                similarity = self._clip_inference(crop_ims, texts)
+                text_idx = torch.argmax(similarity, dim=-1)  # (M, )
+                if len(filter_idx):
+                    text_idx += (torch.tensor(filter_idx, device=self.device)[None] <= int(text_idx)).sum(0)
+                idx[text_idx] = True
+            prompt_results.append(result[idx])
+        return prompt_results
+    def _clip_inference(self, images, texts):
+        """
+        CLIP Inference process.
+        Args:
+            images (List[PIL.Image]): A list of source images and each of them should be PIL.Image type with RGB channel order.
+            texts (List[str]): A list of prompt texts and each of them should be string object.
+        Returns:
+            (torch.Tensor): The similarity between given images and texts.
+        """
+        try:
+            import clip
+        except ImportError:
+            checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
+            import clip
+        if (not hasattr(self, "clip_model")) or (not hasattr(self, "clip_preprocess")):
+            self.clip_model, self.clip_preprocess = clip.load("ViT-B/32", device=self.device)
+        images = torch.stack([self.clip_preprocess(image).to(self.device) for image in images])
+        tokenized_text = clip.tokenize(texts).to(self.device)
+        image_features = self.clip_model.encode_image(images)
+        text_features = self.clip_model.encode_text(tokenized_text)
+        image_features /= image_features.norm(dim=-1, keepdim=True)  # (N, 512)
+        text_features /= text_features.norm(dim=-1, keepdim=True)  # (M, 512)
+        return (image_features * text_features[:, None]).sum(-1)  # (M, N)
+    def set_prompts(self, prompts):
+        """Set prompts in advance."""
+        self.prompts = prompts

ultralytics/models/sam/build.py CHANGED Viewed

@@ -14,7 +14,7 @@ from ultralytics.utils.downloads import attempt_download_asset
 from .modules.decoders import MaskDecoder
 from .modules.encoders import ImageEncoderViT, PromptEncoder
-from .modules.sam import Sam
+from .modules.sam import SAMModel
 from .modules.tiny_encoder import TinyViT
 from .modules.transformer import TwoWayTransformer
@@ -105,7 +105,7 @@ def _build_sam(
             out_chans=prompt_embed_dim,
         )
     )
-    sam = Sam(
+    sam = SAMModel(
         image_encoder=image_encoder,
         prompt_encoder=PromptEncoder(
             embed_dim=prompt_embed_dim,

ultralytics/models/sam/model.py CHANGED Viewed

@@ -44,6 +44,7 @@ class SAM(Model):
         """
         if model and Path(model).suffix not in {".pt", ".pth"}:
             raise NotImplementedError("SAM prediction requires pre-trained *.pt or *.pth model.")
+        self.is_sam2 = "sam2" in Path(model).stem
         super().__init__(model=model, task="segment")
     def _load(self, weights: str, task=None):
@@ -54,7 +55,12 @@ class SAM(Model):
             weights (str): Path to the weights file.
             task (str, optional): Task name. Defaults to None.
         """
-        self.model = build_sam(weights)
+        if self.is_sam2:
+            from ..sam2.build import build_sam2
+            self.model = build_sam2(weights)
+        else:
+            self.model = build_sam(weights)
     def predict(self, source, stream=False, bboxes=None, points=None, labels=None, **kwargs):
         """
@@ -112,4 +118,6 @@ class SAM(Model):
         Returns:
             (dict): A dictionary mapping the 'segment' task to its corresponding 'Predictor'.
         """
-        return {"segment": {"predictor": Predictor}}
+        from ..sam2.predict import SAM2Predictor
+        return {"segment": {"predictor": SAM2Predictor if self.is_sam2 else Predictor}}

ultralytics/models/sam/modules/decoders.py CHANGED Viewed

@@ -4,9 +4,8 @@ from typing import List, Tuple, Type
 import torch
 from torch import nn
-from torch.nn import functional as F
-from ultralytics.nn.modules import LayerNorm2d
+from ultralytics.nn.modules import MLP, LayerNorm2d
 class MaskDecoder(nn.Module):
@@ -28,7 +27,6 @@ class MaskDecoder(nn.Module):
     def __init__(
         self,
-        *,
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
@@ -149,42 +147,3 @@ class MaskDecoder(nn.Module):
         iou_pred = self.iou_prediction_head(iou_token_out)
         return masks, iou_pred
-class MLP(nn.Module):
-    """
-    MLP (Multi-Layer Perceptron) model lightly adapted from
-    https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
-    """
-    def __init__(
-        self,
-        input_dim: int,
-        hidden_dim: int,
-        output_dim: int,
-        num_layers: int,
-        sigmoid_output: bool = False,
-    ) -> None:
-        """
-        Initializes the MLP (Multi-Layer Perceptron) model.
-        Args:
-            input_dim (int): The dimensionality of the input features.
-            hidden_dim (int): The dimensionality of the hidden layers.
-            output_dim (int): The dimensionality of the output layer.
-            num_layers (int): The number of hidden layers.
-            sigmoid_output (bool, optional): Apply a sigmoid activation to the output layer. Defaults to False.
-        """
-        super().__init__()
-        self.num_layers = num_layers
-        h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
-        self.sigmoid_output = sigmoid_output
-    def forward(self, x):
-        """Executes feedforward within the neural network module and applies activation."""
-        for i, layer in enumerate(self.layers):
-            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
-        if self.sigmoid_output:
-            x = torch.sigmoid(x)
-        return x

ultralytics/models/sam/modules/encoders.py CHANGED Viewed

@@ -211,6 +211,8 @@ class PromptEncoder(nn.Module):
         point_embedding[labels == -1] += self.not_a_point_embed.weight
         point_embedding[labels == 0] += self.point_embeddings[0].weight
         point_embedding[labels == 1] += self.point_embeddings[1].weight
+        point_embedding[labels == 2] += self.point_embeddings[2].weight
+        point_embedding[labels == 3] += self.point_embeddings[3].weight
         return point_embedding
     def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
@@ -226,8 +228,8 @@ class PromptEncoder(nn.Module):
         """Embeds mask inputs."""
         return self.mask_downscaling(masks)
+    @staticmethod
     def _get_batch_size(
-        self,
         points: Optional[Tuple[torch.Tensor, torch.Tensor]],
         boxes: Optional[torch.Tensor],
         masks: Optional[torch.Tensor],

ultralytics/models/sam/modules/sam.py CHANGED Viewed

@@ -15,15 +15,14 @@ from .decoders import MaskDecoder
 from .encoders import ImageEncoderViT, PromptEncoder
-class Sam(nn.Module):
+class SAMModel(nn.Module):
     """
-    Sam (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate image
-    embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by the mask
-    decoder to predict object masks.
+    SAMModel (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate
+    image embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by
+    the mask decoder to predict object masks.
     Attributes:
         mask_threshold (float): Threshold value for mask prediction.
-        image_format (str): Format of the input image, default is 'RGB'.
         image_encoder (ImageEncoderViT): The backbone used to encode the image into embeddings.
         prompt_encoder (PromptEncoder): Encodes various types of input prompts.
         mask_decoder (MaskDecoder): Predicts object masks from the image and prompt embeddings.
@@ -32,7 +31,6 @@ class Sam(nn.Module):
     """
     mask_threshold: float = 0.0
-    image_format: str = "RGB"
     def __init__(
         self,
@@ -43,7 +41,7 @@ class Sam(nn.Module):
         pixel_std: List[float] = (58.395, 57.12, 57.375),
     ) -> None:
         """
-        Initialize the Sam class to predict object masks from an image and input prompts.
+        Initialize the SAMModel class to predict object masks from an image and input prompts.
         Note:
             All forward() operations moved to SAMPredictor.

ultralytics/models/sam/modules/transformer.py CHANGED Viewed

@@ -86,7 +86,6 @@ class TwoWayTransformer(nn.Module):
           (torch.Tensor): the processed image_embedding
         """
         # BxCxHxW -> BxHWxC == B x N_image_tokens x C
-        bs, c, h, w = image_embedding.shape
         image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
         image_pe = image_pe.flatten(2).permute(0, 2, 1)
@@ -212,6 +211,7 @@ class Attention(nn.Module):
         embedding_dim: int,
         num_heads: int,
         downsample_rate: int = 1,
+        kv_in_dim: int = None,
     ) -> None:
         """
         Initializes the Attention model with the given dimensions and settings.
@@ -226,13 +226,14 @@ class Attention(nn.Module):
         """
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
         self.internal_dim = embedding_dim // downsample_rate
         self.num_heads = num_heads
         assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."
         self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
-        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
-        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
     @staticmethod

ultralytics/models/sam/predict.py CHANGED Viewed

@@ -168,7 +168,7 @@ class Predictor(BasePredictor):
                 - np.ndarray: An array of length C containing quality scores predicted by the model for each mask.
                 - np.ndarray: Low-resolution logits of shape CxHxW for subsequent inference, where H=W=256.
         """
-        features = self.model.image_encoder(im) if self.features is None else self.features
+        features = self.get_im_features(im) if self.features is None else self.features
         src_shape, dst_shape = self.batch[1][0].shape[:2], im.shape[2:]
         r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1])
@@ -334,7 +334,7 @@ class Predictor(BasePredictor):
         """
         device = select_device(self.args.device, verbose=verbose)
         if model is None:
-            model = build_sam(self.args.model)
+            model = self.get_model()
         model.eval()
         self.model = model.to(device)
         self.device = device
@@ -348,6 +348,10 @@ class Predictor(BasePredictor):
         self.model.fp16 = False
         self.done_warmup = True
+    def get_model(self):
+        """Built Segment Anything Model (SAM) model."""
+        return build_sam(self.args.model)
     def postprocess(self, preds, img, orig_imgs):
         """
         Post-processes SAM's inference outputs to generate object detection masks and bounding boxes.
@@ -412,16 +416,18 @@ class Predictor(BasePredictor):
             AssertionError: If more than one image is set.
         """
         if self.model is None:
-            model = build_sam(self.args.model)
-            self.setup_model(model)
+            self.setup_model(model=None)
         self.setup_source(image)
         assert len(self.dataset) == 1, "`set_image` only supports setting one image!"
         for batch in self.dataset:
             im = self.preprocess(batch[1])
-            self.features = self.model.image_encoder(im)
-            self.im = im
+            self.features = self.get_im_features(im)
             break
+    def get_im_features(self, im):
+        """Get image features from the SAM image encoder."""
+        return self.model.image_encoder(im)
     def set_prompts(self, prompts):
         """Set prompts in advance."""
         self.prompts = prompts

ultralytics/models/sam2/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+from .model import SAM2
+from .predict import SAM2Predictor
+__all__ = "SAM2", "SAM2Predictor"  # tuple or list

ultralytics 8.2.68__py3-none-any.whl → 8.2.70__py3-none-any.whl

Potentially problematic release.

ultralytics 8.2.68__py3-none-any.whl → 8.2.70__py3-none-any.whl