ultralytics 8.2.69.tar.gz → 8.2.71.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ultralytics has been flagged as potentially problematic.
- {ultralytics-8.2.69/ultralytics.egg-info → ultralytics-8.2.71}/PKG-INFO +1 -1
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/__init__.py +3 -2
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/__init__.py +4 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/converter.py +81 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/trainer.py +3 -2
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/validator.py +2 -2
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/__init__.py +2 -1
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/fastsam/predict.py +1 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/build.py +2 -2
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/model.py +10 -2
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/modules/decoders.py +1 -42
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/modules/encoders.py +3 -1
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/modules/sam.py +5 -7
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/modules/transformer.py +4 -3
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/predict.py +12 -6
- ultralytics-8.2.71/ultralytics/models/sam2/__init__.py +6 -0
- ultralytics-8.2.71/ultralytics/models/sam2/build.py +156 -0
- ultralytics-8.2.71/ultralytics/models/sam2/model.py +97 -0
- ultralytics-8.2.71/ultralytics/models/sam2/modules/decoders.py +305 -0
- ultralytics-8.2.71/ultralytics/models/sam2/modules/encoders.py +332 -0
- ultralytics-8.2.71/ultralytics/models/sam2/modules/memory_attention.py +170 -0
- ultralytics-8.2.71/ultralytics/models/sam2/modules/sam2.py +804 -0
- ultralytics-8.2.71/ultralytics/models/sam2/modules/sam2_blocks.py +715 -0
- ultralytics-8.2.71/ultralytics/models/sam2/modules/utils.py +191 -0
- ultralytics-8.2.71/ultralytics/models/sam2/predict.py +182 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/transformer.py +5 -3
- ultralytics-8.2.71/ultralytics/trackers/utils/__init__.py +1 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/__init__.py +9 -9
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/plotting.py +1 -1
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/torch_utils.py +11 -7
- {ultralytics-8.2.69 → ultralytics-8.2.71/ultralytics.egg-info}/PKG-INFO +1 -1
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics.egg-info/SOURCES.txt +11 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/LICENSE +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/README.md +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/pyproject.toml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/setup.cfg +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/conftest.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_cli.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_cuda.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_engine.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_explorer.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_exports.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_integrations.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_python.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/tests/test_solutions.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/assets/bus.jpg +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/assets/zidane.jpg +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/Argoverse.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/DOTAv1.5.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/DOTAv1.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/GlobalWheat2020.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/ImageNet.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/Objects365.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/SKU-110K.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/VOC.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/VisDrone.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/african-wildlife.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/brain-tumor.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/carparts-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco-pose.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco128-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco128.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco8-pose.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco8-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/coco8.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/crack-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/dota8.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/lvis.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/open-images-v7.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/package-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/signature.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/tiger-pose.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/datasets/xView.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/default.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v10/yolov10b.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v10/yolov10l.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v10/yolov10m.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v10/yolov10n.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v10/yolov10s.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v10/yolov10x.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v3/yolov3-spp.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v3/yolov3-tiny.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v3/yolov3.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v5/yolov5-p6.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v5/yolov5.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v6/yolov6.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-cls.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-ghost.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-obb.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-p2.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-p6.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-pose.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-world.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8-worldv2.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v8/yolov8.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9c-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9c.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9e-seg.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9e.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9m.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9s.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/models/v9/yolov9t.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/trackers/botsort.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/cfg/trackers/bytetrack.yaml +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/annotator.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/augment.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/base.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/build.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/dataset.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/explorer/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/explorer/explorer.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/explorer/gui/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/explorer/gui/dash.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/explorer/utils.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/loaders.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/split_dota.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/data/utils.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/exporter.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/model.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/predictor.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/results.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/engine/tuner.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/hub/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/hub/auth.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/hub/google/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/hub/session.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/hub/utils.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/fastsam/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/fastsam/model.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/fastsam/utils.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/fastsam/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/nas/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/nas/model.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/nas/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/nas/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/rtdetr/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/rtdetr/model.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/rtdetr/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/rtdetr/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/rtdetr/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/amg.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/modules/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/sam/modules/tiny_encoder.py +0 -0
- {ultralytics-8.2.69/ultralytics/models/utils → ultralytics-8.2.71/ultralytics/models/sam2/modules}/__init__.py +0 -0
- {ultralytics-8.2.69/ultralytics/trackers → ultralytics-8.2.71/ultralytics/models}/utils/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/utils/loss.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/utils/ops.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/classify/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/classify/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/classify/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/classify/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/detect/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/detect/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/detect/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/detect/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/model.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/obb/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/obb/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/obb/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/obb/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/pose/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/pose/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/pose/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/pose/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/segment/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/segment/predict.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/segment/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/segment/val.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/world/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/world/train.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/models/yolo/world/train_world.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/autobackend.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/activation.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/block.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/conv.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/head.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/modules/utils.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/nn/tasks.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/ai_gym.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/analytics.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/distance_calculation.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/heatmap.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/object_counter.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/parking_management.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/queue_management.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/speed_estimation.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/solutions/streamlit_inference.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/basetrack.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/bot_sort.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/byte_tracker.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/track.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/utils/gmc.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/utils/kalman_filter.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/trackers/utils/matching.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/autobatch.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/benchmarks.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/__init__.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/base.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/clearml.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/comet.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/dvc.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/hub.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/mlflow.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/neptune.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/raytune.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/tensorboard.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/callbacks/wb.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/checks.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/dist.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/downloads.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/errors.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/files.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/instance.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/loss.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/metrics.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/ops.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/patches.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/tal.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/triton.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics/utils/tuner.py +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics.egg-info/dependency_links.txt +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics.egg-info/entry_points.txt +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics.egg-info/requires.txt +0 -0
- {ultralytics-8.2.69 → ultralytics-8.2.71}/ultralytics.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ultralytics
-Version: 8.2.69
+Version: 8.2.71
 Summary: Ultralytics YOLOv8 for SOTA object detection, multi-object tracking, instance segmentation, pose estimation and image classification.
 Author: Glenn Jocher, Ayush Chaurasia, Jing Qiu
 Maintainer: Glenn Jocher, Ayush Chaurasia, Jing Qiu
ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.2.69"
+__version__ = "8.2.71"
 
 import os
 
@@ -8,7 +8,7 @@ import os
 os.environ["OMP_NUM_THREADS"] = "1"  # reduce CPU utilization during training
 
 from ultralytics.data.explorer.explorer import Explorer
-from ultralytics.models import NAS, RTDETR, SAM, YOLO, FastSAM, YOLOWorld
+from ultralytics.models import NAS, RTDETR, SAM, SAM2, YOLO, FastSAM, YOLOWorld
 from ultralytics.utils import ASSETS, SETTINGS
 from ultralytics.utils.checks import check_yolo as checks
 from ultralytics.utils.downloads import download
@@ -21,6 +21,7 @@ __all__ = (
     "YOLOWorld",
     "NAS",
     "SAM",
+    "SAM2",
     "FastSAM",
     "RTDETR",
     "checks",
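With SAM2 exported at the package root, a quick smoke test looks like this (a sketch only: it assumes the `sam2_b.pt` checkpoint name used in `ultralytics/models/sam2/build.py` below, and that `SAM2` inherits the standard `Model.info()` helper):

```python
from ultralytics import SAM2

model = SAM2("sam2_b.pt")  # checkpoint is auto-downloaded as a release asset if missing
model.info()  # print a summary of the loaded SAM2Model
```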
ultralytics/data/converter.py
@@ -334,6 +334,87 @@ def convert_coco(
     LOGGER.info(f"{'LVIS' if lvis else 'COCO'} data converted successfully.\nResults saved to {save_dir.resolve()}")
 
 
+def convert_segment_masks_to_yolo_seg(masks_dir, output_dir, classes):
+    """
+    Converts a dataset of segmentation mask images to the YOLO segmentation format.
+
+    This function takes the directory containing the binary format mask images and converts them into YOLO
+    segmentation format. The converted masks are saved in the specified output directory.
+
+    Args:
+        masks_dir (str): The path to the directory where all mask images (png, jpg) are stored.
+        output_dir (str): The path to the directory where the converted YOLO segmentation masks will be stored.
+        classes (int): Total classes in the dataset, i.e. for COCO classes=80.
+
+    Example:
+        ```python
+        from ultralytics.data.converter import convert_segment_masks_to_yolo_seg
+
+        # For the COCO dataset, we have 80 classes
+        convert_segment_masks_to_yolo_seg("path/to/masks_directory", "path/to/output/directory", classes=80)
+        ```
+
+    Notes:
+        The expected directory structure for the masks is:
+
+            - masks
+                ├─ mask_image_01.png or mask_image_01.jpg
+                ├─ mask_image_02.png or mask_image_02.jpg
+                ├─ mask_image_03.png or mask_image_03.jpg
+                └─ mask_image_04.png or mask_image_04.jpg
+
+        After execution, the labels will be organized in the following structure:
+
+            - output_dir
+                ├─ mask_yolo_01.txt
+                ├─ mask_yolo_02.txt
+                ├─ mask_yolo_03.txt
+                └─ mask_yolo_04.txt
+    """
+    import os
+
+    pixel_to_class_mapping = {i + 1: i for i in range(80)}
+    for mask_filename in os.listdir(masks_dir):
+        if mask_filename.endswith(".png"):
+            mask_path = os.path.join(masks_dir, mask_filename)
+            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)  # Read the mask image in grayscale
+            img_height, img_width = mask.shape  # Get image dimensions
+            LOGGER.info(f"Processing {mask_path} imgsz = {img_height} x {img_width}")
+
+            unique_values = np.unique(mask)  # Get unique pixel values representing different classes
+            yolo_format_data = []
+
+            for value in unique_values:
+                if value == 0:
+                    continue  # Skip background
+                class_index = pixel_to_class_mapping.get(value, -1)
+                if class_index == -1:
+                    LOGGER.warning(f"Unknown class for pixel value {value} in file {mask_filename}, skipping.")
+                    continue
+
+                # Create a binary mask for the current class and find contours
+                contours, _ = cv2.findContours(
+                    (mask == value).astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+                )  # Find contours
+
+                for contour in contours:
+                    if len(contour) >= 3:  # YOLO requires at least 3 points for a valid segmentation
+                        contour = contour.squeeze()  # Remove single-dimensional entries
+                        yolo_format = [class_index]
+                        for point in contour:
+                            # Normalize the coordinates
+                            yolo_format.append(round(point[0] / img_width, 6))  # Rounding to 6 decimal places
+                            yolo_format.append(round(point[1] / img_height, 6))
+                        yolo_format_data.append(yolo_format)
+            # Save Ultralytics YOLO format data to file
+            output_path = os.path.join(output_dir, os.path.splitext(mask_filename)[0] + ".txt")
+            with open(output_path, "w") as file:
+                for item in yolo_format_data:
+                    line = " ".join(map(str, item))
+                    file.write(line + "\n")
+            LOGGER.info(f"Processed and stored at {output_path} imgsz = {img_height} x {img_width}")
+
+
 def convert_dota_to_yolo_obb(dota_root_path: str):
     """
     Converts DOTA dataset annotations to YOLO OBB (Oriented Bounding Box) format.
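A runnable sketch of the new converter on a synthetic mask (directory names are illustrative; cv2 and numpy are already ultralytics dependencies). Note that in this release the pixel-to-class mapping is built from `range(80)` rather than from the `classes` argument, so pixel values above 80 are skipped as unknown:

```python
from pathlib import Path

import cv2
import numpy as np

from ultralytics.data.converter import convert_segment_masks_to_yolo_seg

Path("masks").mkdir(exist_ok=True)
Path("labels").mkdir(exist_ok=True)

# Pixel value 0 is background; value 1 maps to class index 0, value 2 to class index 1, etc.
mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 1  # one square object of class 0
cv2.imwrite("masks/mask_image_01.png", mask)

convert_segment_masks_to_yolo_seg("masks", "labels", classes=80)
# labels/mask_image_01.txt now holds rows like "0 x1 y1 x2 y2 ..." with coordinates normalized to [0, 1]
```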
ultralytics/engine/trainer.py
@@ -26,6 +26,7 @@ from ultralytics.data.utils import check_cls_dataset, check_det_dataset
 from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
 from ultralytics.utils import (
     DEFAULT_CFG,
+    LOCAL_RANK,
     LOGGER,
     RANK,
     TQDM,
@@ -129,7 +130,7 @@ class BaseTrainer:
 
         # Model and Dataset
         self.model = check_model_file_from_stem(self.args.model)  # add suffix, i.e. yolov8n -> yolov8n.pt
-        with torch_distributed_zero_first(RANK):  # avoid auto-downloading dataset multiple times
+        with torch_distributed_zero_first(LOCAL_RANK):  # avoid auto-downloading dataset multiple times
             self.trainset, self.testset = self.get_dataset()
         self.ema = None
 
@@ -285,7 +286,7 @@ class BaseTrainer:
 
         # Dataloaders
        batch_size = self.batch_size // max(world_size, 1)
-        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
+        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=LOCAL_RANK, mode="train")
        if RANK in {-1, 0}:
            # Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
            self.test_loader = self.get_dataloader(
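Switching the barrier argument from the global RANK to LOCAL_RANK matters for multi-node DDP: each node has its own disk, so the dataset download must happen once per node, gated on the node-local rank 0. Roughly how the context manager behaves (a paraphrase of the pattern, not the exact ultralytics implementation):

```python
from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """Make non-zero local ranks wait until the local master finishes the guarded block."""
    initialized = dist.is_available() and dist.is_initialized()
    if initialized and local_rank not in {-1, 0}:
        dist.barrier(device_ids=[local_rank])  # wait while local rank 0 downloads the dataset
    yield
    if initialized and local_rank == 0:
        dist.barrier(device_ids=[0])  # release the waiting ranks once rank 0 is done
```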
ultralytics/engine/validator.py
@@ -136,8 +136,8 @@ class BaseValidator:
             if engine:
                 self.args.batch = model.batch_size
             elif not pt and not jit:
-                self.args.batch = 1  # export.py models default to batch-size 1
-                LOGGER.info(f"Forcing batch=1 square inference (1,3,{imgsz},{imgsz}) for non-PyTorch models")
+                self.args.batch = model.metadata.get("batch", 1)  # export.py models default to batch-size 1
+                LOGGER.info(f"Setting batch={self.args.batch} input of shape ({self.args.batch}, 3, {imgsz}, {imgsz})")
 
             if str(self.args.data).split(".")[-1] in {"yaml", "yml"}:
                 self.data = check_det_dataset(self.args.data)
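Effect: validating an exported (non-PyTorch, non-TorchScript) model now honors the batch size stored in its export metadata instead of always forcing batch=1. A hedged sketch using the standard export/val API (whether `batch` is honored depends on the export format):

```python
from ultralytics import YOLO

YOLO("yolov8n.pt").export(format="onnx", batch=2)  # batch size is recorded in the export metadata
metrics = YOLO("yolov8n.onnx").val(data="coco8.yaml")  # validator now reads batch=2 from metadata
```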
ultralytics/models/__init__.py
@@ -4,6 +4,7 @@ from .fastsam import FastSAM
 from .nas import NAS
 from .rtdetr import RTDETR
 from .sam import SAM
+from .sam2 import SAM2
 from .yolo import YOLO, YOLOWorld
 
-__all__ = "YOLO", "RTDETR", "SAM", "FastSAM", "NAS", "YOLOWorld"  # allow simpler import
+__all__ = "YOLO", "RTDETR", "SAM", "FastSAM", "NAS", "YOLOWorld", "SAM2"  # allow simpler import
ultralytics/models/fastsam/predict.py
@@ -21,6 +21,7 @@ class FastSAMPredictor(SegmentationPredictor):
     """
 
     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
+        """Initializes a FastSAMPredictor for fast SAM segmentation tasks in Ultralytics YOLO framework."""
         super().__init__(cfg, overrides, _callbacks)
         self.prompts = {}
 
ultralytics/models/sam/build.py
@@ -14,7 +14,7 @@ from ultralytics.utils.downloads import attempt_download_asset
 
 from .modules.decoders import MaskDecoder
 from .modules.encoders import ImageEncoderViT, PromptEncoder
-from .modules.sam import Sam
+from .modules.sam import SAMModel
 from .modules.tiny_encoder import TinyViT
 from .modules.transformer import TwoWayTransformer
 
@@ -105,7 +105,7 @@ def _build_sam(
             out_chans=prompt_embed_dim,
         )
     )
-    sam = Sam(
+    sam = SAMModel(
        image_encoder=image_encoder,
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
ultralytics/models/sam/model.py
@@ -44,6 +44,7 @@ class SAM(Model):
         """
         if model and Path(model).suffix not in {".pt", ".pth"}:
             raise NotImplementedError("SAM prediction requires pre-trained *.pt or *.pth model.")
+        self.is_sam2 = "sam2" in Path(model).stem
         super().__init__(model=model, task="segment")
 
     def _load(self, weights: str, task=None):
@@ -54,7 +55,12 @@ class SAM(Model):
             weights (str): Path to the weights file.
             task (str, optional): Task name. Defaults to None.
         """
-        self.model = build_sam(weights)
+        if self.is_sam2:
+            from ..sam2.build import build_sam2
+
+            self.model = build_sam2(weights)
+        else:
+            self.model = build_sam(weights)
 
     def predict(self, source, stream=False, bboxes=None, points=None, labels=None, **kwargs):
         """
@@ -112,4 +118,6 @@ class SAM(Model):
         Returns:
             (dict): A dictionary mapping the 'segment' task to its corresponding 'Predictor'.
         """
-        return {"segment": {"predictor": Predictor}}
+        from ..sam2.predict import SAM2Predictor
+
+        return {"segment": {"predictor": SAM2Predictor if self.is_sam2 else Predictor}}
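The existing SAM entry point now dispatches on the weight filename: any stem containing "sam2" routes loading to build_sam2 and prediction to SAM2Predictor. A usage sketch (checkpoint name per the sam_model_map in sam2/build.py below):

```python
from ultralytics import SAM

model = SAM("sam2_b.pt")  # "sam2" in the stem selects build_sam2 + SAM2Predictor
results = model.predict("ultralytics/assets/bus.jpg", bboxes=[439, 437, 524, 709])
```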
ultralytics/models/sam/modules/decoders.py
@@ -4,9 +4,8 @@ from typing import List, Tuple, Type
 
 import torch
 from torch import nn
-from torch.nn import functional as F
 
-from ultralytics.nn.modules import LayerNorm2d
+from ultralytics.nn.modules import MLP, LayerNorm2d
 
 
 class MaskDecoder(nn.Module):
@@ -28,7 +27,6 @@ class MaskDecoder(nn.Module):
 
     def __init__(
         self,
-        *,
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
@@ -149,42 +147,3 @@ class MaskDecoder(nn.Module):
         iou_pred = self.iou_prediction_head(iou_token_out)
 
         return masks, iou_pred
-
-
-class MLP(nn.Module):
-    """
-    MLP (Multi-Layer Perceptron) model lightly adapted from
-    https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py
-    """
-
-    def __init__(
-        self,
-        input_dim: int,
-        hidden_dim: int,
-        output_dim: int,
-        num_layers: int,
-        sigmoid_output: bool = False,
-    ) -> None:
-        """
-        Initializes the MLP (Multi-Layer Perceptron) model.
-
-        Args:
-            input_dim (int): The dimensionality of the input features.
-            hidden_dim (int): The dimensionality of the hidden layers.
-            output_dim (int): The dimensionality of the output layer.
-            num_layers (int): The number of hidden layers.
-            sigmoid_output (bool, optional): Apply a sigmoid activation to the output layer. Defaults to False.
-        """
-        super().__init__()
-        self.num_layers = num_layers
-        h = [hidden_dim] * (num_layers - 1)
-        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
-        self.sigmoid_output = sigmoid_output
-
-    def forward(self, x):
-        """Executes feedforward within the neural network module and applies activation."""
-        for i, layer in enumerate(self.layers):
-            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
-        if self.sigmoid_output:
-            x = torch.sigmoid(x)
-        return x
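The decoder-private MLP is replaced by the shared implementation exported from ultralytics.nn.modules. A minimal equivalence sketch using the positional signature of the removed class (keyword names on the shared module are assumed compatible):

```python
import torch

from ultralytics.nn.modules import MLP

head = MLP(256, 256, 32, 3)  # 3-layer perceptron: 256 -> 256 -> 256 -> 32, ReLU between hidden layers
out = head(torch.rand(1, 256))
print(out.shape)  # torch.Size([1, 32])
```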
ultralytics/models/sam/modules/encoders.py
@@ -211,6 +211,8 @@ class PromptEncoder(nn.Module):
         point_embedding[labels == -1] += self.not_a_point_embed.weight
         point_embedding[labels == 0] += self.point_embeddings[0].weight
         point_embedding[labels == 1] += self.point_embeddings[1].weight
+        point_embedding[labels == 2] += self.point_embeddings[2].weight
+        point_embedding[labels == 3] += self.point_embeddings[3].weight
         return point_embedding
 
     def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
@@ -226,8 +228,8 @@
         """Embeds mask inputs."""
         return self.mask_downscaling(masks)
 
+    @staticmethod
     def _get_batch_size(
-        self,
         points: Optional[Tuple[torch.Tensor, torch.Tensor]],
         boxes: Optional[torch.Tensor],
         masks: Optional[torch.Tensor],
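Labels 2 and 3 follow the SAM 2 convention of feeding a box prompt to the point encoder as its two corner points. Illustrative prompt tensors only (shapes per the prompt-encoder API; the coordinate values are arbitrary):

```python
import torch

# -1 = padding, 0 = negative click, 1 = positive click, 2/3 = box top-left / bottom-right corner
coords = torch.tensor([[[500.0, 375.0], [439.0, 437.0], [524.0, 709.0]]])  # (B, N, 2)
labels = torch.tensor([[1, 2, 3]])  # one positive click plus a box encoded as two corner points
```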
ultralytics/models/sam/modules/sam.py
@@ -15,15 +15,14 @@ from .decoders import MaskDecoder
 from .encoders import ImageEncoderViT, PromptEncoder
 
 
-class Sam(nn.Module):
+class SAMModel(nn.Module):
     """
-    Sam (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate image
-    embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by the mask
-    decoder to predict object masks.
+    SAMModel (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate
+    image embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by
+    the mask decoder to predict object masks.
 
     Attributes:
         mask_threshold (float): Threshold value for mask prediction.
-        image_format (str): Format of the input image, default is 'RGB'.
         image_encoder (ImageEncoderViT): The backbone used to encode the image into embeddings.
         prompt_encoder (PromptEncoder): Encodes various types of input prompts.
         mask_decoder (MaskDecoder): Predicts object masks from the image and prompt embeddings.
@@ -32,7 +31,6 @@ class Sam(nn.Module):
     """
 
     mask_threshold: float = 0.0
-    image_format: str = "RGB"
 
     def __init__(
         self,
@@ -43,7 +41,7 @@ class Sam(nn.Module):
         pixel_std: List[float] = (58.395, 57.12, 57.375),
     ) -> None:
         """
-        Initialize the Sam class to predict object masks from an image and input prompts.
+        Initialize the SAMModel class to predict object masks from an image and input prompts.
 
         Note:
             All forward() operations moved to SAMPredictor.
ultralytics/models/sam/modules/transformer.py
@@ -86,7 +86,6 @@ class TwoWayTransformer(nn.Module):
             (torch.Tensor): the processed image_embedding
         """
         # BxCxHxW -> BxHWxC == B x N_image_tokens x C
-        bs, c, h, w = image_embedding.shape
         image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
         image_pe = image_pe.flatten(2).permute(0, 2, 1)
 
@@ -212,6 +211,7 @@ class Attention(nn.Module):
         embedding_dim: int,
         num_heads: int,
         downsample_rate: int = 1,
+        kv_in_dim: int = None,
     ) -> None:
         """
         Initializes the Attention model with the given dimensions and settings.
@@ -226,13 +226,14 @@
         """
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
         self.internal_dim = embedding_dim // downsample_rate
         self.num_heads = num_heads
         assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."
 
         self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
-        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
-        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
 
     @staticmethod
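kv_in_dim lets the same Attention block perform cross-attention where keys and values arrive in a narrower feature width than the queries, which the new SAM2 memory attention relies on. A small sketch:

```python
import torch

from ultralytics.models.sam.modules.transformer import Attention

attn = Attention(embedding_dim=256, num_heads=8, kv_in_dim=64)  # K/V projected from 64-d up to 256-d internally

q = torch.rand(2, 100, 256)  # queries
kv = torch.rand(2, 500, 64)  # narrower memory features
out = attn(q, kv, kv)
print(out.shape)  # torch.Size([2, 100, 256])
```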
ultralytics/models/sam/predict.py
@@ -168,7 +168,7 @@ class Predictor(BasePredictor):
             - np.ndarray: An array of length C containing quality scores predicted by the model for each mask.
             - np.ndarray: Low-resolution logits of shape CxHxW for subsequent inference, where H=W=256.
         """
-        features = self.model.image_encoder(im) if self.features is None else self.features
+        features = self.get_im_features(im) if self.features is None else self.features
 
         src_shape, dst_shape = self.batch[1][0].shape[:2], im.shape[2:]
         r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1])
@@ -334,7 +334,7 @@ class Predictor(BasePredictor):
         """
         device = select_device(self.args.device, verbose=verbose)
         if model is None:
-            model = build_sam(self.args.model)
+            model = self.get_model()
         model.eval()
         self.model = model.to(device)
         self.device = device
@@ -348,6 +348,10 @@ class Predictor(BasePredictor):
         self.model.fp16 = False
         self.done_warmup = True
 
+    def get_model(self):
+        """Built Segment Anything Model (SAM) model."""
+        return build_sam(self.args.model)
+
     def postprocess(self, preds, img, orig_imgs):
         """
         Post-processes SAM's inference outputs to generate object detection masks and bounding boxes.
@@ -412,16 +416,18 @@ class Predictor(BasePredictor):
             AssertionError: If more than one image is set.
         """
         if self.model is None:
-            model = build_sam(self.args.model)
-            self.setup_model(model)
+            self.setup_model(model=None)
         self.setup_source(image)
         assert len(self.dataset) == 1, "`set_image` only supports setting one image!"
         for batch in self.dataset:
             im = self.preprocess(batch[1])
-            self.features = self.model.image_encoder(im)
-            self.im = im
+            self.features = self.get_im_features(im)
             break
 
+    def get_im_features(self, im):
+        """Get image features from the SAM image encoder."""
+        return self.model.image_encoder(im)
+
     def set_prompts(self, prompts):
         """Set prompts in advance."""
         self.prompts = prompts
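Behavior is unchanged by the refactor: set_image() still encodes once and caches features, so repeated prompt calls skip the encoder. The documented prompted-inference pattern:

```python
from ultralytics.models.sam import Predictor

predictor = Predictor(overrides=dict(task="segment", mode="predict", model="sam_b.pt", imgsz=1024))
predictor.set_image("ultralytics/assets/zidane.jpg")  # encoder runs once, features cached
r1 = predictor(points=[900, 370], labels=[1])  # reuses cached features
r2 = predictor(bboxes=[439, 437, 524, 709])  # reuses cached features again
predictor.reset_image()  # clear the cache before a new image
```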
ultralytics/models/sam2/build.py (new file)
@@ -0,0 +1,156 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+
+import torch
+
+from ultralytics.utils.downloads import attempt_download_asset
+
+from .modules.encoders import FpnNeck, Hiera, ImageEncoder, MemoryEncoder
+from .modules.memory_attention import MemoryAttention, MemoryAttentionLayer
+from .modules.sam2 import SAM2Model
+
+
+def build_sam2_t(checkpoint=None):
+    """Build and return a Segment Anything Model (SAM2) tiny-size model with specified architecture parameters."""
+    return _build_sam2(
+        encoder_embed_dim=96,
+        encoder_stages=[1, 2, 7, 2],
+        encoder_num_heads=1,
+        encoder_global_att_blocks=[5, 7, 9],
+        encoder_window_spec=[8, 4, 14, 7],
+        encoder_backbone_channel_list=[768, 384, 192, 96],
+        checkpoint=checkpoint,
+    )
+
+
+def build_sam2_s(checkpoint=None):
+    """Builds and returns a small-size Segment Anything Model (SAM2) with specified architecture parameters."""
+    return _build_sam2(
+        encoder_embed_dim=96,
+        encoder_stages=[1, 2, 11, 2],
+        encoder_num_heads=1,
+        encoder_global_att_blocks=[7, 10, 13],
+        encoder_window_spec=[8, 4, 14, 7],
+        encoder_backbone_channel_list=[768, 384, 192, 96],
+        checkpoint=checkpoint,
+    )
+
+
+def build_sam2_b(checkpoint=None):
+    """Builds and returns a Segment Anything Model (SAM2) base-size model with specified architecture parameters."""
+    return _build_sam2(
+        encoder_embed_dim=112,
+        encoder_stages=[2, 3, 16, 3],
+        encoder_num_heads=2,
+        encoder_global_att_blocks=[12, 16, 20],
+        encoder_window_spec=[8, 4, 14, 7],
+        encoder_window_spatial_size=[14, 14],
+        encoder_backbone_channel_list=[896, 448, 224, 112],
+        checkpoint=checkpoint,
+    )
+
+
+def build_sam2_l(checkpoint=None):
+    """Build and return a Segment Anything Model (SAM2) large-size model with specified architecture parameters."""
+    return _build_sam2(
+        encoder_embed_dim=144,
+        encoder_stages=[2, 6, 36, 4],
+        encoder_num_heads=2,
+        encoder_global_att_blocks=[23, 33, 43],
+        encoder_window_spec=[8, 4, 16, 8],
+        encoder_backbone_channel_list=[1152, 576, 288, 144],
+        checkpoint=checkpoint,
+    )
+
+
+def _build_sam2(
+    encoder_embed_dim=1280,
+    encoder_stages=[2, 6, 36, 4],
+    encoder_num_heads=2,
+    encoder_global_att_blocks=[7, 15, 23, 31],
+    encoder_backbone_channel_list=[1152, 576, 288, 144],
+    encoder_window_spatial_size=[7, 7],
+    encoder_window_spec=[8, 4, 16, 8],
+    checkpoint=None,
+):
+    """Builds a SAM2 model with specified architecture parameters and optional checkpoint loading."""
+    image_encoder = ImageEncoder(
+        trunk=Hiera(
+            embed_dim=encoder_embed_dim,
+            num_heads=encoder_num_heads,
+            stages=encoder_stages,
+            global_att_blocks=encoder_global_att_blocks,
+            window_pos_embed_bkg_spatial_size=encoder_window_spatial_size,
+            window_spec=encoder_window_spec,
+        ),
+        neck=FpnNeck(
+            d_model=256,
+            backbone_channel_list=encoder_backbone_channel_list,
+            fpn_top_down_levels=[2, 3],
+            fpn_interp_model="nearest",
+        ),
+        scalp=1,
+    )
+    memory_attention = MemoryAttention(d_model=256, pos_enc_at_input=True, num_layers=4, layer=MemoryAttentionLayer())
+    memory_encoder = MemoryEncoder(out_dim=64)
+
+    sam2 = SAM2Model(
+        image_encoder=image_encoder,
+        memory_attention=memory_attention,
+        memory_encoder=memory_encoder,
+        num_maskmem=7,
+        image_size=1024,
+        sigmoid_scale_for_mem_enc=20.0,
+        sigmoid_bias_for_mem_enc=-10.0,
+        use_mask_input_as_output_without_sam=True,
+        directly_add_no_mem_embed=True,
+        use_high_res_features_in_sam=True,
+        multimask_output_in_sam=True,
+        iou_prediction_use_sigmoid=True,
+        use_obj_ptrs_in_encoder=True,
+        add_tpos_enc_to_obj_ptrs=True,
+        only_obj_ptrs_in_the_past_for_eval=True,
+        pred_obj_scores=True,
+        pred_obj_scores_mlp=True,
+        fixed_no_obj_ptr=True,
+        multimask_output_for_tracking=True,
+        use_multimask_token_for_obj_ptr=True,
+        multimask_min_pt_num=0,
+        multimask_max_pt_num=1,
+        use_mlp_for_obj_ptr_proj=True,
+        compile_image_encoder=False,
+        sam_mask_decoder_extra_args=dict(
+            dynamic_multimask_via_stability=True,
+            dynamic_multimask_stability_delta=0.05,
+            dynamic_multimask_stability_thresh=0.98,
+        ),
+    )
+
+    if checkpoint is not None:
+        checkpoint = attempt_download_asset(checkpoint)
+        with open(checkpoint, "rb") as f:
+            state_dict = torch.load(f)["model"]
+        sam2.load_state_dict(state_dict)
+    sam2.eval()
+    return sam2
+
+
+sam_model_map = {
+    "sam2_t.pt": build_sam2_t,
+    "sam2_s.pt": build_sam2_s,
+    "sam2_b.pt": build_sam2_b,
+    "sam2_l.pt": build_sam2_l,
+}
+
+
+def build_sam2(ckpt="sam_b.pt"):
+    """Constructs a Segment Anything Model (SAM2) based on the specified checkpoint, with various size options."""
+    model_builder = None
+    ckpt = str(ckpt)  # to allow Path ckpt types
+    for k in sam_model_map.keys():
+        if ckpt.endswith(k):
+            model_builder = sam_model_map.get(k)
+
+    if not model_builder:
+        raise FileNotFoundError(f"{ckpt} is not a supported SAM model. Available models are: \n {sam_model_map.keys()}")
+
+    return model_builder(ckpt)
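Checkpoint selection in build_sam2 is by filename suffix against sam_model_map, so a direct build is one call (assuming the sam2_t.pt asset is published where attempt_download_asset looks):

```python
from ultralytics.models.sam2.build import build_sam2

model = build_sam2("sam2_t.pt")  # downloads the checkpoint if missing, returns an eval-mode SAM2Model
print(sum(p.numel() for p in model.parameters()))  # rough parameter-count sanity check
```

Note that the default `ckpt="sam_b.pt"` does not match any key in sam_model_map, so calling build_sam2() with no argument raises FileNotFoundError; a supported sam2_* name must be passed explicitly.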