dgenerate-ultralytics-headless 8.3.214__py3-none-any.whl → 8.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/METADATA +64 -74
- dgenerate_ultralytics_headless-8.4.7.dist-info/RECORD +311 -0
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/WHEEL +1 -1
- tests/__init__.py +7 -9
- tests/conftest.py +8 -15
- tests/test_cli.py +1 -1
- tests/test_cuda.py +13 -10
- tests/test_engine.py +9 -9
- tests/test_exports.py +65 -13
- tests/test_integrations.py +13 -13
- tests/test_python.py +125 -69
- tests/test_solutions.py +161 -152
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +86 -92
- ultralytics/cfg/datasets/Argoverse.yaml +7 -6
- ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
- ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
- ultralytics/cfg/datasets/ImageNet.yaml +1 -1
- ultralytics/cfg/datasets/TT100K.yaml +346 -0
- ultralytics/cfg/datasets/VOC.yaml +15 -16
- ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
- ultralytics/cfg/datasets/coco-pose.yaml +21 -0
- ultralytics/cfg/datasets/coco12-formats.yaml +101 -0
- ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
- ultralytics/cfg/datasets/coco8-pose.yaml +21 -0
- ultralytics/cfg/datasets/dog-pose.yaml +28 -0
- ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/hand-keypoints.yaml +26 -2
- ultralytics/cfg/datasets/kitti.yaml +27 -0
- ultralytics/cfg/datasets/lvis.yaml +5 -5
- ultralytics/cfg/datasets/open-images-v7.yaml +1 -1
- ultralytics/cfg/datasets/tiger-pose.yaml +16 -0
- ultralytics/cfg/datasets/xView.yaml +16 -16
- ultralytics/cfg/default.yaml +4 -2
- ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
- ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
- ultralytics/cfg/models/26/yolo26-cls.yaml +33 -0
- ultralytics/cfg/models/26/yolo26-obb.yaml +52 -0
- ultralytics/cfg/models/26/yolo26-p2.yaml +60 -0
- ultralytics/cfg/models/26/yolo26-p6.yaml +62 -0
- ultralytics/cfg/models/26/yolo26-pose.yaml +53 -0
- ultralytics/cfg/models/26/yolo26-seg.yaml +52 -0
- ultralytics/cfg/models/26/yolo26.yaml +52 -0
- ultralytics/cfg/models/26/yoloe-26-seg.yaml +53 -0
- ultralytics/cfg/models/26/yoloe-26.yaml +53 -0
- ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml +1 -1
- ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +1 -1
- ultralytics/cfg/models/v10/yolov10b.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10l.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10m.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10n.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10s.yaml +2 -2
- ultralytics/cfg/models/v10/yolov10x.yaml +2 -2
- ultralytics/cfg/models/v3/yolov3-tiny.yaml +1 -1
- ultralytics/cfg/models/v6/yolov6.yaml +1 -1
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
- ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
- ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-seg-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
- ultralytics/cfg/models/v9/yolov9s.yaml +1 -1
- ultralytics/data/__init__.py +4 -4
- ultralytics/data/annotator.py +5 -6
- ultralytics/data/augment.py +300 -475
- ultralytics/data/base.py +18 -26
- ultralytics/data/build.py +147 -25
- ultralytics/data/converter.py +108 -87
- ultralytics/data/dataset.py +47 -75
- ultralytics/data/loaders.py +42 -49
- ultralytics/data/split.py +5 -6
- ultralytics/data/split_dota.py +8 -15
- ultralytics/data/utils.py +36 -45
- ultralytics/engine/exporter.py +351 -263
- ultralytics/engine/model.py +186 -225
- ultralytics/engine/predictor.py +45 -54
- ultralytics/engine/results.py +198 -325
- ultralytics/engine/trainer.py +165 -106
- ultralytics/engine/tuner.py +41 -43
- ultralytics/engine/validator.py +55 -38
- ultralytics/hub/__init__.py +16 -19
- ultralytics/hub/auth.py +6 -12
- ultralytics/hub/google/__init__.py +7 -10
- ultralytics/hub/session.py +15 -25
- ultralytics/hub/utils.py +5 -8
- ultralytics/models/__init__.py +1 -1
- ultralytics/models/fastsam/__init__.py +1 -1
- ultralytics/models/fastsam/model.py +8 -10
- ultralytics/models/fastsam/predict.py +18 -30
- ultralytics/models/fastsam/utils.py +1 -2
- ultralytics/models/fastsam/val.py +5 -7
- ultralytics/models/nas/__init__.py +1 -1
- ultralytics/models/nas/model.py +5 -8
- ultralytics/models/nas/predict.py +7 -9
- ultralytics/models/nas/val.py +1 -2
- ultralytics/models/rtdetr/__init__.py +1 -1
- ultralytics/models/rtdetr/model.py +5 -8
- ultralytics/models/rtdetr/predict.py +15 -19
- ultralytics/models/rtdetr/train.py +10 -13
- ultralytics/models/rtdetr/val.py +21 -23
- ultralytics/models/sam/__init__.py +15 -2
- ultralytics/models/sam/amg.py +14 -20
- ultralytics/models/sam/build.py +26 -19
- ultralytics/models/sam/build_sam3.py +377 -0
- ultralytics/models/sam/model.py +29 -32
- ultralytics/models/sam/modules/blocks.py +83 -144
- ultralytics/models/sam/modules/decoders.py +19 -37
- ultralytics/models/sam/modules/encoders.py +44 -101
- ultralytics/models/sam/modules/memory_attention.py +16 -30
- ultralytics/models/sam/modules/sam.py +200 -73
- ultralytics/models/sam/modules/tiny_encoder.py +64 -83
- ultralytics/models/sam/modules/transformer.py +18 -28
- ultralytics/models/sam/modules/utils.py +174 -50
- ultralytics/models/sam/predict.py +2248 -350
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +529 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +199 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +339 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/vitdet.py +547 -0
- ultralytics/models/sam/sam3/vl_combiner.py +160 -0
- ultralytics/models/utils/loss.py +14 -26
- ultralytics/models/utils/ops.py +13 -17
- ultralytics/models/yolo/__init__.py +1 -1
- ultralytics/models/yolo/classify/predict.py +10 -13
- ultralytics/models/yolo/classify/train.py +12 -33
- ultralytics/models/yolo/classify/val.py +30 -29
- ultralytics/models/yolo/detect/predict.py +9 -12
- ultralytics/models/yolo/detect/train.py +17 -23
- ultralytics/models/yolo/detect/val.py +77 -59
- ultralytics/models/yolo/model.py +43 -60
- ultralytics/models/yolo/obb/predict.py +7 -16
- ultralytics/models/yolo/obb/train.py +14 -17
- ultralytics/models/yolo/obb/val.py +40 -37
- ultralytics/models/yolo/pose/__init__.py +1 -1
- ultralytics/models/yolo/pose/predict.py +7 -22
- ultralytics/models/yolo/pose/train.py +13 -16
- ultralytics/models/yolo/pose/val.py +39 -58
- ultralytics/models/yolo/segment/predict.py +17 -21
- ultralytics/models/yolo/segment/train.py +7 -10
- ultralytics/models/yolo/segment/val.py +95 -47
- ultralytics/models/yolo/world/train.py +8 -14
- ultralytics/models/yolo/world/train_world.py +11 -34
- ultralytics/models/yolo/yoloe/__init__.py +7 -7
- ultralytics/models/yolo/yoloe/predict.py +16 -23
- ultralytics/models/yolo/yoloe/train.py +36 -44
- ultralytics/models/yolo/yoloe/train_seg.py +11 -11
- ultralytics/models/yolo/yoloe/val.py +15 -20
- ultralytics/nn/__init__.py +7 -7
- ultralytics/nn/autobackend.py +159 -85
- ultralytics/nn/modules/__init__.py +68 -60
- ultralytics/nn/modules/activation.py +4 -6
- ultralytics/nn/modules/block.py +260 -224
- ultralytics/nn/modules/conv.py +52 -97
- ultralytics/nn/modules/head.py +831 -299
- ultralytics/nn/modules/transformer.py +76 -88
- ultralytics/nn/modules/utils.py +16 -21
- ultralytics/nn/tasks.py +180 -195
- ultralytics/nn/text_model.py +45 -69
- ultralytics/optim/__init__.py +5 -0
- ultralytics/optim/muon.py +338 -0
- ultralytics/solutions/__init__.py +12 -12
- ultralytics/solutions/ai_gym.py +13 -19
- ultralytics/solutions/analytics.py +15 -16
- ultralytics/solutions/config.py +6 -7
- ultralytics/solutions/distance_calculation.py +10 -13
- ultralytics/solutions/heatmap.py +8 -14
- ultralytics/solutions/instance_segmentation.py +6 -9
- ultralytics/solutions/object_blurrer.py +7 -10
- ultralytics/solutions/object_counter.py +12 -19
- ultralytics/solutions/object_cropper.py +8 -14
- ultralytics/solutions/parking_management.py +34 -32
- ultralytics/solutions/queue_management.py +10 -12
- ultralytics/solutions/region_counter.py +9 -12
- ultralytics/solutions/security_alarm.py +15 -20
- ultralytics/solutions/similarity_search.py +10 -15
- ultralytics/solutions/solutions.py +77 -76
- ultralytics/solutions/speed_estimation.py +7 -10
- ultralytics/solutions/streamlit_inference.py +2 -4
- ultralytics/solutions/templates/similarity-search.html +7 -18
- ultralytics/solutions/trackzone.py +7 -10
- ultralytics/solutions/vision_eye.py +5 -8
- ultralytics/trackers/__init__.py +1 -1
- ultralytics/trackers/basetrack.py +3 -5
- ultralytics/trackers/bot_sort.py +10 -27
- ultralytics/trackers/byte_tracker.py +21 -37
- ultralytics/trackers/track.py +4 -7
- ultralytics/trackers/utils/gmc.py +11 -22
- ultralytics/trackers/utils/kalman_filter.py +37 -48
- ultralytics/trackers/utils/matching.py +12 -15
- ultralytics/utils/__init__.py +124 -124
- ultralytics/utils/autobatch.py +2 -4
- ultralytics/utils/autodevice.py +17 -18
- ultralytics/utils/benchmarks.py +57 -71
- ultralytics/utils/callbacks/base.py +8 -10
- ultralytics/utils/callbacks/clearml.py +5 -13
- ultralytics/utils/callbacks/comet.py +32 -46
- ultralytics/utils/callbacks/dvc.py +13 -18
- ultralytics/utils/callbacks/mlflow.py +4 -5
- ultralytics/utils/callbacks/neptune.py +7 -15
- ultralytics/utils/callbacks/platform.py +423 -38
- ultralytics/utils/callbacks/raytune.py +3 -4
- ultralytics/utils/callbacks/tensorboard.py +25 -31
- ultralytics/utils/callbacks/wb.py +16 -14
- ultralytics/utils/checks.py +127 -85
- ultralytics/utils/cpu.py +3 -8
- ultralytics/utils/dist.py +9 -12
- ultralytics/utils/downloads.py +25 -33
- ultralytics/utils/errors.py +6 -14
- ultralytics/utils/events.py +2 -4
- ultralytics/utils/export/__init__.py +4 -236
- ultralytics/utils/export/engine.py +246 -0
- ultralytics/utils/export/imx.py +117 -63
- ultralytics/utils/export/tensorflow.py +231 -0
- ultralytics/utils/files.py +26 -30
- ultralytics/utils/git.py +9 -11
- ultralytics/utils/instance.py +30 -51
- ultralytics/utils/logger.py +212 -114
- ultralytics/utils/loss.py +601 -215
- ultralytics/utils/metrics.py +128 -156
- ultralytics/utils/nms.py +13 -16
- ultralytics/utils/ops.py +117 -166
- ultralytics/utils/patches.py +75 -21
- ultralytics/utils/plotting.py +75 -80
- ultralytics/utils/tal.py +125 -59
- ultralytics/utils/torch_utils.py +53 -79
- ultralytics/utils/tqdm.py +24 -21
- ultralytics/utils/triton.py +13 -19
- ultralytics/utils/tuner.py +19 -10
- dgenerate_ultralytics_headless-8.3.214.dist-info/RECORD +0 -283
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.214.dist-info → dgenerate_ultralytics_headless-8.4.7.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/encoders.py

@@ -21,8 +21,7 @@ from .blocks import (
 
 
 class ImageEncoderViT(nn.Module):
-    """
-    An image encoder using Vision Transformer (ViT) architecture for encoding images into a compact latent space.
+    """An image encoder using Vision Transformer (ViT) architecture for encoding images into a compact latent space.
 
     This class processes images by splitting them into patches, applying transformer blocks, and generating a final
     encoded representation through a neck module.
@@ -64,8 +63,7 @@ class ImageEncoderViT(nn.Module):
         window_size: int = 0,
         global_attn_indexes: tuple[int, ...] = (),
     ) -> None:
-        """
-        Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
+        """Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
 
         Args:
             img_size (int): Input image size, assumed to be square.
@@ -84,12 +82,6 @@ class ImageEncoderViT(nn.Module):
             rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero.
             window_size (int): Size of attention window for windowed attention blocks.
             global_attn_indexes (tuple[int, ...]): Indices of blocks that use global attention.
-
-        Examples:
-            >>> encoder = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
-            >>> input_image = torch.randn(1, 3, 224, 224)
-            >>> output = encoder(input_image)
-            >>> print(output.shape)
         """
         super().__init__()
         self.img_size = img_size
@@ -156,8 +148,7 @@ class ImageEncoderViT(nn.Module):
 
 
 class PromptEncoder(nn.Module):
-    """
-    Encode different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.
+    """Encode different types of prompts for input to SAM's mask decoder, producing sparse and dense embeddings.
 
     Attributes:
         embed_dim (int): Dimension of the embeddings.
@@ -193,8 +184,7 @@ class PromptEncoder(nn.Module):
         mask_in_chans: int,
         activation: type[nn.Module] = nn.GELU,
     ) -> None:
-        """
-        Initialize the PromptEncoder module for encoding various types of prompts.
+        """Initialize the PromptEncoder module for encoding various types of prompts.
 
         Args:
             embed_dim (int): The dimension of the embeddings.
@@ -202,15 +192,6 @@ class PromptEncoder(nn.Module):
             input_image_size (tuple[int, int]): The padded size of the input image as (H, W).
             mask_in_chans (int): The number of hidden channels used for encoding input masks.
             activation (Type[nn.Module]): The activation function to use when encoding input masks.
-
-        Examples:
-            >>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
-            >>> points = (torch.rand(1, 5, 2), torch.randint(0, 4, (1, 5)))
-            >>> boxes = torch.rand(1, 2, 2)
-            >>> masks = torch.rand(1, 1, 256, 256)
-            >>> sparse_embeddings, dense_embeddings = prompt_encoder(points, boxes, masks)
-            >>> print(sparse_embeddings.shape, dense_embeddings.shape)
-            torch.Size([1, 7, 256]) torch.Size([1, 256, 64, 64])
         """
         super().__init__()
         self.embed_dim = embed_dim
@@ -236,15 +217,14 @@ class PromptEncoder(nn.Module):
         self.no_mask_embed = nn.Embedding(1, embed_dim)
 
     def get_dense_pe(self) -> torch.Tensor:
-        """
-        Return the dense positional encoding used for encoding point prompts.
+        """Return the dense positional encoding used for encoding point prompts.
 
         Generate a positional encoding for a dense set of points matching the shape of the image
         encoding. The encoding is used to provide spatial information to the model when processing point prompts.
 
         Returns:
-            (torch.Tensor): Positional encoding tensor with shape (1, embed_dim, H, W), where H and W are the
-                height and width of the image embedding size, respectively.
+            (torch.Tensor): Positional encoding tensor with shape (1, embed_dim, H, W), where H and W are the height and
+                width of the image embedding size, respectively.
 
         Examples:
             >>> prompt_encoder = PromptEncoder(256, (64, 64), (1024, 1024), 16)
@@ -306,13 +286,11 @@ class PromptEncoder(nn.Module):
         boxes: torch.Tensor | None,
         masks: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Embed different types of prompts, returning both sparse and dense embeddings.
+        """Embed different types of prompts, returning both sparse and dense embeddings.
 
         Args:
-            points (tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
-                tensor contains coordinates of shape (B, N, 2), and the second tensor contains labels of
-                shape (B, N).
+            points (tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first tensor
+                contains coordinates of shape (B, N, 2), and the second tensor contains labels of shape (B, N).
             boxes (torch.Tensor | None): Boxes to embed with shape (B, M, 2, 2), where M is the number of boxes.
             masks (torch.Tensor | None): Masks to embed with shape (B, 1, H, W).
 
@@ -354,11 +332,10 @@ class PromptEncoder(nn.Module):
 
 
 class MemoryEncoder(nn.Module):
-    """
-    Encode pixel features and masks into a memory representation for efficient image segmentation.
+    """Encode pixel features and masks into a memory representation for efficient image segmentation.
 
-    This class processes pixel-level features and masks, fusing them to generate encoded memory representations
-    suitable for downstream tasks in image segmentation models like SAM (Segment Anything Model).
+    This class processes pixel-level features and masks, fusing them to generate encoded memory representations suitable
+    for downstream tasks in image segmentation models like SAM (Segment Anything Model).
 
     Attributes:
         mask_downsampler (MaskDownSampler): Module for downsampling input masks.
@@ -384,9 +361,9 @@ class MemoryEncoder(nn.Module):
         self,
         out_dim,
         in_dim=256,  # in_dim of pix_feats
+        interpol_size: tuple[int, int] | None = None,
     ):
-        """
-        Initialize the MemoryEncoder for encoding pixel features and masks into memory representations.
+        """Initialize the MemoryEncoder for encoding pixel features and masks into memory representations.
 
         This encoder processes pixel-level features and masks, fusing them to generate encoded memory representations
         suitable for downstream tasks in image segmentation models like SAM (Segment Anything Model).
@@ -394,18 +371,12 @@ class MemoryEncoder(nn.Module):
         Args:
             out_dim (int): Output dimension of the encoded features.
             in_dim (int): Input dimension of the pixel features.
-
-        Examples:
-            >>> encoder = MemoryEncoder(out_dim=256, in_dim=256)
-            >>> pix_feat = torch.randn(1, 256, 64, 64)
-            >>> masks = torch.randn(1, 1, 64, 64)
-            >>> encoded_feat, pos = encoder(pix_feat, masks)
-            >>> print(encoded_feat.shape, pos.shape)
-            torch.Size([1, 256, 64, 64]) torch.Size([1, 128, 64, 64])
+            interpol_size (tuple[int, int] | None): Size to interpolate masks to. If None, uses the size of pixel
+                features.
         """
         super().__init__()
 
-        self.mask_downsampler = MaskDownSampler(kernel_size=3, stride=2, padding=1)
+        self.mask_downsampler = MaskDownSampler(kernel_size=3, stride=2, padding=1, interpol_size=interpol_size)
 
         self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
         self.fuser = Fuser(CXBlock(dim=256), num_layers=2)
@@ -439,11 +410,10 @@ class MemoryEncoder(nn.Module):
 
 
 class ImageEncoder(nn.Module):
-    """
-    Encode images using a trunk-neck architecture, producing multiscale features and positional encodings.
+    """Encode images using a trunk-neck architecture, producing multiscale features and positional encodings.
 
-    This class combines a trunk network for feature extraction with a neck network for feature refinement
-    and positional encoding generation. It can optionally discard the lowest resolution features.
+    This class combines a trunk network for feature extraction with a neck network for feature refinement and positional
+    encoding generation. It can optionally discard the lowest resolution features.
 
     Attributes:
         trunk (nn.Module): The trunk network for initial feature extraction.
@@ -469,25 +439,15 @@ class ImageEncoder(nn.Module):
         neck: nn.Module,
         scalp: int = 0,
     ):
-        """
-        Initialize the ImageEncoder with trunk and neck networks for feature extraction and refinement.
+        """Initialize the ImageEncoder with trunk and neck networks for feature extraction and refinement.
 
-        This encoder combines a trunk network for feature extraction with a neck network for feature refinement
-        and positional encoding generation. It can optionally discard the lowest resolution features.
+        This encoder combines a trunk network for feature extraction with a neck network for feature refinement and
+        positional encoding generation. It can optionally discard the lowest resolution features.
 
         Args:
             trunk (nn.Module): The trunk network for initial feature extraction.
             neck (nn.Module): The neck network for feature refinement and positional encoding generation.
             scalp (int): Number of lowest resolution feature levels to discard.
-
-        Examples:
-            >>> trunk = SomeTrunkNetwork()
-            >>> neck = SomeNeckNetwork()
-            >>> encoder = ImageEncoder(trunk, neck, scalp=1)
-            >>> image = torch.randn(1, 3, 224, 224)
-            >>> output = encoder(image)
-            >>> print(output.keys())
-            dict_keys(['vision_features', 'vision_pos_enc', 'backbone_fpn'])
         """
         super().__init__()
         self.trunk = trunk
@@ -513,11 +473,10 @@ class ImageEncoder(nn.Module):
 
 
 class FpnNeck(nn.Module):
-    """
-    A Feature Pyramid Network (FPN) neck variant for multiscale feature fusion in object detection models.
+    """A Feature Pyramid Network (FPN) neck variant for multiscale feature fusion in object detection models.
 
-    This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
-    similar to ViT positional embedding interpolation.
+    This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing, similar to ViT
+    positional embedding interpolation.
 
     Attributes:
         position_encoding (PositionEmbeddingSine): Sinusoidal positional encoding module.
@@ -550,11 +509,10 @@ class FpnNeck(nn.Module):
         fuse_type: str = "sum",
         fpn_top_down_levels: list[int] | None = None,
     ):
-        """
-        Initialize a modified Feature Pyramid Network (FPN) neck.
+        """Initialize a modified Feature Pyramid Network (FPN) neck.
 
-        This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing,
-        similar to ViT positional embedding interpolation.
+        This FPN variant removes the output convolution and uses bicubic interpolation for feature resizing, similar to
+        ViT positional embedding interpolation.
 
         Args:
             d_model (int): Dimension of the model.
@@ -565,11 +523,6 @@ class FpnNeck(nn.Module):
             fpn_interp_model (str): Interpolation mode for FPN feature resizing.
             fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
            fpn_top_down_levels (Optional[list[int]]): Levels to have top-down features in outputs.
-
-        Examples:
-            >>> backbone_channels = [64, 128, 256, 512]
-            >>> fpn_neck = FpnNeck(256, backbone_channels)
-            >>> print(fpn_neck)
         """
         super().__init__()
         self.position_encoding = PositionEmbeddingSine(num_pos_feats=256)
@@ -603,8 +556,7 @@ class FpnNeck(nn.Module):
         self.fpn_top_down_levels = list(fpn_top_down_levels)
 
     def forward(self, xs: list[torch.Tensor]):
-        """
-        Perform forward pass through the Feature Pyramid Network (FPN) neck.
+        """Perform forward pass through the Feature Pyramid Network (FPN) neck.
 
        This method processes a list of input tensors from the backbone through the FPN, applying lateral connections
        and top-down feature fusion. It generates output feature maps and corresponding positional encodings.
@@ -613,8 +565,8 @@ class FpnNeck(nn.Module):
            xs (list[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).
 
        Returns:
-            out (list[torch.Tensor]): List of output feature maps after FPN processing, each with shape
-                (B, d_model, H, W).
+            out (list[torch.Tensor]): List of output feature maps after FPN processing, each with shape (B, d_model, H,
+                W).
            pos (list[torch.Tensor]): List of positional encodings corresponding to each output feature map.
 
        Examples:
@@ -656,12 +608,11 @@ class FpnNeck(nn.Module):
 
 
 class Hiera(nn.Module):
-    """
-    Hierarchical vision transformer for efficient multiscale feature extraction in image processing tasks.
+    """Hierarchical vision transformer for efficient multiscale feature extraction in image processing tasks.
 
-    This class implements a Hiera model, which is a hierarchical vision transformer architecture designed for
-    efficient multiscale feature extraction. It uses a series of transformer blocks organized into stages, with
-    optional pooling and global attention mechanisms.
+    This class implements a Hiera model, which is a hierarchical vision transformer architecture designed for efficient
+    multiscale feature extraction. It uses a series of transformer blocks organized into stages, with optional pooling
+    and global attention mechanisms.
 
     Attributes:
         window_spec (tuple[int, ...]): Window sizes for each stage.
@@ -715,12 +666,11 @@ class Hiera(nn.Module):
         ),
         return_interm_layers=True,  # return feats from every stage
     ):
-        """
-        Initialize a Hiera model, a hierarchical vision transformer for efficient multiscale feature extraction.
+        """Initialize a Hiera model, a hierarchical vision transformer for efficient multiscale feature extraction.
 
-        Hiera is a hierarchical vision transformer architecture designed for efficient multiscale feature extraction
-        in image processing tasks. It uses a series of transformer blocks organized into stages, with optional
-        pooling and global attention mechanisms.
+        Hiera is a hierarchical vision transformer architecture designed for efficient multiscale feature extraction in
+        image processing tasks. It uses a series of transformer blocks organized into stages, with optional pooling and
+        global attention mechanisms.
 
         Args:
             embed_dim (int): Initial embedding dimension for the model.
@@ -731,17 +681,11 @@ class Hiera(nn.Module):
            stages (tuple[int, ...]): Number of blocks per stage.
            dim_mul (float): Dimension multiplier factor at stage transitions.
            head_mul (float): Head multiplier factor at stage transitions.
-            window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding background.
+            window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding
+                background.
            window_spec (tuple[int, ...]): Window sizes for each stage when not using global attention.
            global_att_blocks (tuple[int, ...]): Indices of blocks that use global attention.
            return_interm_layers (bool): Whether to return intermediate layer outputs.
-
-        Examples:
-            >>> model = Hiera(embed_dim=96, num_heads=1, stages=(2, 3, 16, 3))
-            >>> input_tensor = torch.randn(1, 3, 224, 224)
-            >>> output_features = model(input_tensor)
-            >>> for feat in output_features:
-            ...     print(feat.shape)
         """
         super().__init__()
 
@@ -816,8 +760,7 @@ class Hiera(nn.Module):
         return pos_embed
 
     def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
-        """
-        Perform forward pass through Hiera model, extracting multiscale features from input images.
+        """Perform forward pass through Hiera model, extracting multiscale features from input images.
 
        Args:
            x (torch.Tensor): Input tensor with shape (B, C, H, W) representing a batch of images.
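The MemoryEncoder hunks above add an optional interpol_size argument and forward it to MaskDownSampler, so input masks can first be interpolated to a fixed spatial size before downsampling. A minimal sketch of the new constructor call, based only on the signature shown in the diff; the module path is inferred from the file list and the size value is illustrative:

```python
# Sketch based on the MemoryEncoder hunk above; the chosen interpol_size value is
# illustrative, not a package default.
from ultralytics.models.sam.modules.encoders import MemoryEncoder

# Previous behavior: masks are downsampled directly (interpol_size=None).
encoder = MemoryEncoder(out_dim=256, in_dim=256)

# New in this diff: interpolate masks to a fixed size first; the value is forwarded to
# MaskDownSampler(kernel_size=3, stride=2, padding=1, interpol_size=...).
encoder_fixed = MemoryEncoder(out_dim=256, in_dim=256, interpol_size=(64, 64))
```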
ultralytics/models/sam/modules/memory_attention.py

@@ -11,8 +11,7 @@ from .blocks import RoPEAttention
 
 
 class MemoryAttentionLayer(nn.Module):
-    """
-    Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.
+    """Implements a memory attention layer with self-attention and cross-attention mechanisms for neural networks.
 
     This class combines self-attention, cross-attention, and feedforward components to process input tensors and
     generate memory-based attention outputs.
@@ -60,9 +59,10 @@ class MemoryAttentionLayer(nn.Module):
         pos_enc_at_attn: bool = False,
         pos_enc_at_cross_attn_keys: bool = True,
         pos_enc_at_cross_attn_queries: bool = False,
+        self_attn: nn.Module | None = None,
+        cross_attn: nn.Module | None = None,
     ):
-        """
-        Initialize a memory attention layer with self-attention, cross-attention, and feedforward components.
+        """Initialize a memory attention layer with self-attention, cross-attention, and feedforward components.
 
         Args:
             d_model (int): Dimensionality of the model.
@@ -71,13 +71,15 @@ class MemoryAttentionLayer(nn.Module):
             pos_enc_at_attn (bool): Whether to add positional encoding at attention.
             pos_enc_at_cross_attn_keys (bool): Whether to add positional encoding to cross-attention keys.
             pos_enc_at_cross_attn_queries (bool): Whether to add positional encoding to cross-attention queries.
+            self_attn (nn.Module | None): Custom self-attention module. If None, a default RoPEAttention is used.
+            cross_attn (nn.Module | None): Custom cross-attention module. If None, a default RoPEAttention is used.
         """
         super().__init__()
         self.d_model = d_model
         self.dim_feedforward = dim_feedforward
         self.dropout_value = dropout
-        self.self_attn = RoPEAttention(embedding_dim=256, num_heads=1, downsample_rate=1)
-        self.cross_attn_image = RoPEAttention(
+        self.self_attn = self_attn or RoPEAttention(embedding_dim=256, num_heads=1, downsample_rate=1)
+        self.cross_attn_image = cross_attn or RoPEAttention(
             rope_k_repeat=True,
             embedding_dim=256,
             num_heads=1,
@@ -145,8 +147,7 @@ class MemoryAttentionLayer(nn.Module):
         query_pos: torch.Tensor | None = None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
-        """
-        Process input tensors through self-attention, cross-attention, and feedforward network layers.
+        """Process input tensors through self-attention, cross-attention, and feedforward network layers.
 
         Args:
             tgt (torch.Tensor): Target tensor for self-attention with shape (N, L, D).
@@ -168,11 +169,10 @@ class MemoryAttentionLayer(nn.Module):
 
 
 class MemoryAttention(nn.Module):
-    """
-    Memory attention module for processing sequential data with self and cross-attention mechanisms.
+    """Memory attention module for processing sequential data with self and cross-attention mechanisms.
 
-    This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
-    for processing sequential data, particularly useful in transformer-like architectures.
+    This class implements a multi-layer attention mechanism that combines self-attention and cross-attention for
+    processing sequential data, particularly useful in transformer-like architectures.
 
     Attributes:
         d_model (int): The dimension of the model's hidden state.
@@ -206,11 +206,10 @@ class MemoryAttention(nn.Module):
         num_layers: int,
         batch_first: bool = True,  # Do layers expect batch first input?
     ):
-        """
-        Initialize MemoryAttention with specified layers and normalization for sequential data processing.
+        """Initialize MemoryAttention with specified layers and normalization for sequential data processing.
 
-        This class implements a multi-layer attention mechanism that combines self-attention and cross-attention
-        for processing sequential data, particularly useful in transformer-like architectures.
+        This class implements a multi-layer attention mechanism that combines self-attention and cross-attention for
+        processing sequential data, particularly useful in transformer-like architectures.
 
         Args:
             d_model (int): The dimension of the model's hidden state.
@@ -218,18 +217,6 @@ class MemoryAttention(nn.Module):
             layer (nn.Module): The attention layer to be used in the module.
             num_layers (int): The number of attention layers.
            batch_first (bool): Whether the input tensors are in batch-first format.
-
-        Examples:
-            >>> d_model = 256
-            >>> layer = MemoryAttentionLayer(d_model)
-            >>> attention = MemoryAttention(d_model, pos_enc_at_input=True, layer=layer, num_layers=3)
-            >>> curr = torch.randn(10, 32, d_model)  # (seq_len, batch_size, d_model)
-            >>> memory = torch.randn(20, 32, d_model)  # (mem_len, batch_size, d_model)
-            >>> curr_pos = torch.randn(10, 32, d_model)
-            >>> memory_pos = torch.randn(20, 32, d_model)
-            >>> output = attention(curr, memory, curr_pos, memory_pos)
-            >>> print(output.shape)
-            torch.Size([10, 32, 256])
         """
         super().__init__()
         self.d_model = d_model
@@ -247,8 +234,7 @@ class MemoryAttention(nn.Module):
         memory_pos: torch.Tensor | None = None,  # pos_enc for cross-attention inputs
         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
     ) -> torch.Tensor:
-        """
-        Process inputs through attention layers, applying self and cross-attention with positional encoding.
+        """Process inputs through attention layers, applying self and cross-attention with positional encoding.
 
        Args:
            curr (torch.Tensor): Self-attention input tensor, representing the current state.