dgenerate-ultralytics-headless 8.3.190__py3-none-any.whl → 8.3.192__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/RECORD +103 -102
- tests/test_cuda.py +6 -5
- tests/test_exports.py +1 -6
- tests/test_python.py +1 -4
- tests/test_solutions.py +1 -1
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -14
- ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
- ultralytics/cfg/datasets/VisDrone.yaml +4 -4
- ultralytics/data/annotator.py +6 -6
- ultralytics/data/augment.py +53 -51
- ultralytics/data/base.py +15 -13
- ultralytics/data/build.py +7 -4
- ultralytics/data/converter.py +9 -10
- ultralytics/data/dataset.py +24 -22
- ultralytics/data/loaders.py +13 -11
- ultralytics/data/split.py +4 -3
- ultralytics/data/split_dota.py +14 -12
- ultralytics/data/utils.py +29 -23
- ultralytics/engine/exporter.py +2 -2
- ultralytics/engine/model.py +16 -14
- ultralytics/engine/predictor.py +8 -6
- ultralytics/engine/results.py +54 -52
- ultralytics/engine/trainer.py +8 -3
- ultralytics/engine/tuner.py +230 -42
- ultralytics/hub/google/__init__.py +7 -6
- ultralytics/hub/session.py +8 -6
- ultralytics/hub/utils.py +3 -4
- ultralytics/models/fastsam/model.py +8 -6
- ultralytics/models/nas/model.py +5 -3
- ultralytics/models/rtdetr/train.py +4 -3
- ultralytics/models/rtdetr/val.py +6 -4
- ultralytics/models/sam/amg.py +13 -10
- ultralytics/models/sam/model.py +3 -2
- ultralytics/models/sam/modules/blocks.py +21 -21
- ultralytics/models/sam/modules/decoders.py +11 -11
- ultralytics/models/sam/modules/encoders.py +25 -25
- ultralytics/models/sam/modules/memory_attention.py +9 -8
- ultralytics/models/sam/modules/sam.py +8 -10
- ultralytics/models/sam/modules/tiny_encoder.py +21 -20
- ultralytics/models/sam/modules/transformer.py +6 -5
- ultralytics/models/sam/modules/utils.py +7 -5
- ultralytics/models/sam/predict.py +32 -31
- ultralytics/models/utils/loss.py +29 -27
- ultralytics/models/utils/ops.py +10 -8
- ultralytics/models/yolo/classify/train.py +9 -7
- ultralytics/models/yolo/classify/val.py +11 -9
- ultralytics/models/yolo/detect/predict.py +1 -1
- ultralytics/models/yolo/detect/train.py +8 -6
- ultralytics/models/yolo/detect/val.py +22 -20
- ultralytics/models/yolo/model.py +14 -14
- ultralytics/models/yolo/obb/train.py +5 -3
- ultralytics/models/yolo/obb/val.py +11 -9
- ultralytics/models/yolo/pose/train.py +7 -5
- ultralytics/models/yolo/pose/val.py +12 -10
- ultralytics/models/yolo/segment/train.py +4 -5
- ultralytics/models/yolo/segment/val.py +13 -11
- ultralytics/models/yolo/world/train.py +10 -8
- ultralytics/models/yolo/yoloe/train.py +10 -10
- ultralytics/models/yolo/yoloe/val.py +11 -9
- ultralytics/nn/autobackend.py +17 -19
- ultralytics/nn/modules/block.py +12 -12
- ultralytics/nn/modules/conv.py +4 -3
- ultralytics/nn/modules/head.py +41 -37
- ultralytics/nn/modules/transformer.py +22 -21
- ultralytics/nn/tasks.py +2 -2
- ultralytics/nn/text_model.py +6 -5
- ultralytics/solutions/analytics.py +7 -5
- ultralytics/solutions/config.py +12 -10
- ultralytics/solutions/distance_calculation.py +3 -3
- ultralytics/solutions/heatmap.py +4 -2
- ultralytics/solutions/object_counter.py +5 -3
- ultralytics/solutions/parking_management.py +4 -2
- ultralytics/solutions/region_counter.py +7 -5
- ultralytics/solutions/similarity_search.py +5 -3
- ultralytics/solutions/solutions.py +38 -36
- ultralytics/solutions/streamlit_inference.py +8 -7
- ultralytics/trackers/bot_sort.py +11 -9
- ultralytics/trackers/byte_tracker.py +17 -15
- ultralytics/trackers/utils/gmc.py +4 -3
- ultralytics/utils/__init__.py +16 -88
- ultralytics/utils/autobatch.py +3 -2
- ultralytics/utils/autodevice.py +10 -10
- ultralytics/utils/benchmarks.py +11 -10
- ultralytics/utils/callbacks/comet.py +9 -9
- ultralytics/utils/checks.py +17 -26
- ultralytics/utils/export.py +12 -11
- ultralytics/utils/files.py +8 -7
- ultralytics/utils/git.py +139 -0
- ultralytics/utils/instance.py +8 -7
- ultralytics/utils/loss.py +15 -13
- ultralytics/utils/metrics.py +62 -62
- ultralytics/utils/ops.py +3 -2
- ultralytics/utils/patches.py +6 -4
- ultralytics/utils/plotting.py +20 -18
- ultralytics/utils/torch_utils.py +4 -2
- ultralytics/utils/tqdm.py +18 -14
- ultralytics/utils/triton.py +3 -2
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/amg.py
CHANGED
@@ -1,15 +1,18 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+from __future__ import annotations
+
 import math
+from collections.abc import Generator
 from itertools import product
-from typing import Any, Generator, List, Tuple
+from typing import Any

 import numpy as np
 import torch


 def is_box_near_crop_edge(
-    boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+    boxes: torch.Tensor, crop_box: list[int], orig_box: list[int], atol: float = 20.0
 ) -> torch.Tensor:
     """
     Determine if bounding boxes are near the edge of a cropped image region using a specified tolerance.
@@ -38,7 +41,7 @@ def is_box_near_crop_edge(
     return torch.any(near_crop_edge, dim=1)


-def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+def batch_iterator(batch_size: int, *args) -> Generator[list[Any]]:
     """
     Yield batches of data from input arguments with specified batch size for efficient processing.

@@ -106,14 +109,14 @@ def build_point_grid(n_per_side: int) -> np.ndarray:
     return np.stack([points_x, points_y], axis=-1).reshape(-1, 2)


-def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer: int) -> List[np.ndarray]:
+def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer: int) -> list[np.ndarray]:
     """Generate point grids for multiple crop layers with varying scales and densities."""
     return [build_point_grid(int(n_per_side / (scale_per_layer**i))) for i in range(n_layers + 1)]


 def generate_crop_boxes(
-    im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
-) -> Tuple[List[List[int]], List[int]]:
+    im_size: tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> tuple[list[list[int]], list[int]]:
     """
     Generate crop boxes of varying sizes for multiscale image processing, with layered overlapping regions.

@@ -163,7 +166,7 @@ def generate_crop_boxes(
     return crop_boxes, layer_idxs


-def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: list[int]) -> torch.Tensor:
     """Uncrop bounding boxes by adding the crop box offset to their coordinates."""
     x0, y0, _, _ = crop_box
     offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
@@ -173,7 +176,7 @@ def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
     return boxes + offset


-def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+def uncrop_points(points: torch.Tensor, crop_box: list[int]) -> torch.Tensor:
     """Uncrop points by adding the crop box offset to their coordinates."""
     x0, y0, _, _ = crop_box
     offset = torch.tensor([[x0, y0]], device=points.device)
@@ -183,7 +186,7 @@ def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
     return points + offset


-def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int) -> torch.Tensor:
+def uncrop_masks(masks: torch.Tensor, crop_box: list[int], orig_h: int, orig_w: int) -> torch.Tensor:
     """Uncrop masks by padding them to the original image size, handling coordinate transformations."""
     x0, y0, x1, y1 = crop_box
     if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
@@ -194,7 +197,7 @@ def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w:
     return torch.nn.functional.pad(masks, pad, value=0)


-def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tuple[np.ndarray, bool]:
+def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> tuple[np.ndarray, bool]:
     """
     Remove small disconnected regions or holes in a mask based on area threshold and mode.
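The change repeated across every module in this diff is a typing modernization: each file gains `from __future__ import annotations`, drops its `typing.List`/`Tuple`/`Optional`/`Type` imports, and rewrites annotations with built-in generics (PEP 585) and `X | None` unions (PEP 604). A minimal sketch of the pattern; the function below is illustrative only and not part of the package:

```python
# With the __future__ import, annotations are stored as strings and never evaluated
# at runtime, so list[int], tuple[int, int] and float | None are safe even on Python
# versions without native support for these forms.
from __future__ import annotations


def crop_offset(crop_box: list[int], scale: float | None = None) -> tuple[int, int]:
    """Return the (x0, y0) corner of a crop box, optionally scaled (hypothetical helper)."""
    x0, y0 = crop_box[0], crop_box[1]
    s = 1.0 if scale is None else scale
    return int(x0 * s), int(y0 * s)


print(crop_offset([10, 20, 50, 60]))  # (10, 20)
```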
ultralytics/models/sam/model.py
CHANGED
@@ -14,8 +14,9 @@ Key Features:
     - Trained on SA-1B dataset
 """

+from __future__ import annotations
+
 from pathlib import Path
-from typing import Dict, Type

 from ultralytics.engine.model import Model
 from ultralytics.utils.torch_utils import model_info
@@ -154,7 +155,7 @@ class SAM(Model):
         return model_info(self.model, detailed=detailed, verbose=verbose)

     @property
-    def task_map(self) -> Dict[str, Dict[str, Type[Predictor]]]:
+    def task_map(self) -> dict[str, dict[str, type[Predictor]]]:
         """
         Provide a mapping from the 'segment' task to its corresponding 'Predictor'.
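The reworked `task_map` annotation above maps a task name to a nested dict whose values are predictor classes, not instances. A hedged, self-contained sketch of that shape using a placeholder class; the real property lives on `SAM` and returns the package's own `Predictor` types:

```python
from __future__ import annotations


class Predictor:  # placeholder standing in for ultralytics.models.sam.predict.Predictor
    pass


def task_map() -> dict[str, dict[str, type[Predictor]]]:
    """Map the 'segment' task to its predictor class (the class object itself)."""
    return {"segment": {"predictor": Predictor}}


assert task_map()["segment"]["predictor"] is Predictor
```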
ultralytics/models/sam/modules/blocks.py
CHANGED
@@ -1,9 +1,9 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+from __future__ import annotations

 import copy
 import math
 from functools import partial
-from typing import Optional, Tuple, Type, Union

 import numpy as np
 import torch
@@ -81,7 +81,7 @@ class MaskDownSampler(nn.Module):
         stride: int = 4,
         padding: int = 0,
         total_stride: int = 16,
-        activation: Type[nn.Module] = nn.GELU,
+        activation: type[nn.Module] = nn.GELU,
     ):
         """Initialize a mask downsampler module for progressive downsampling and channel expansion."""
         super().__init__()
@@ -227,7 +227,7 @@ class Fuser(nn.Module):
         torch.Size([1, 256, 32, 32])
     """

-    def __init__(self, layer: nn.Module, num_layers: int, dim: Optional[int] = None, input_projection: bool = False):
+    def __init__(self, layer: nn.Module, num_layers: int, dim: int | None = None, input_projection: bool = False):
         """
         Initialize the Fuser module for feature fusion through multiple layers.

@@ -295,7 +295,7 @@ class SAM2TwoWayAttentionBlock(TwoWayAttentionBlock):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int = 2048,
-        activation: Type[nn.Module] = nn.ReLU,
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
         skip_first_layer_pe: bool = False,
     ) -> None:
@@ -359,7 +359,7 @@ class SAM2TwoWayTransformer(TwoWayTransformer):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int,
-        activation: Type[nn.Module] = nn.ReLU,
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
     ) -> None:
         """
@@ -432,7 +432,7 @@ class RoPEAttention(Attention):
         *args,
         rope_theta: float = 10000.0,
         rope_k_repeat: bool = False,
-        feat_sizes: Tuple[int, int] = (32, 32),  # [w, h] for stride 16 feats at 512 resolution
+        feat_sizes: tuple[int, int] = (32, 32),  # [w, h] for stride 16 feats at 512 resolution
         **kwargs,
     ):
         """Initialize RoPEAttention with rotary position encoding for enhanced positional awareness."""
@@ -618,9 +618,9 @@ class MultiScaleBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float = 4.0,
         drop_path: float = 0.0,
-        norm_layer: Union[nn.Module, str] = "LayerNorm",
-        q_stride: Tuple[int, int] = None,
-        act_layer: Type[nn.Module] = nn.GELU,
+        norm_layer: nn.Module | str = "LayerNorm",
+        q_stride: tuple[int, int] = None,
+        act_layer: type[nn.Module] = nn.GELU,
         window_size: int = 0,
     ):
         """Initialize a multiscale attention block with window partitioning and optional query pooling."""
@@ -728,7 +728,7 @@ class PositionEmbeddingSine(nn.Module):
         num_pos_feats: int,
         temperature: int = 10000,
         normalize: bool = True,
-        scale: Optional[float] = None,
+        scale: float | None = None,
     ):
         """Initialize sinusoidal position embeddings for 2D image inputs."""
         super().__init__()
@@ -744,7 +744,7 @@ class PositionEmbeddingSine(nn.Module):

         self.cache = {}

-    def _encode_xy(self, x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _encode_xy(self, x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """Encode 2D positions using sine/cosine functions for transformer positional embeddings."""
         assert len(x) == len(y) and x.ndim == y.ndim == 1
         x_embed = x * self.scale
@@ -833,7 +833,7 @@ class PositionEmbeddingRandom(nn.Module):
         torch.Size([128, 32, 32])
     """

-    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+    def __init__(self, num_pos_feats: int = 64, scale: float | None = None) -> None:
         """Initialize random spatial frequency position embedding for transformers."""
         super().__init__()
         if scale is None or scale <= 0.0:
@@ -853,7 +853,7 @@ class PositionEmbeddingRandom(nn.Module):
         # Outputs d_1 x ... x d_n x C shape
         return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

-    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+    def forward(self, size: tuple[int, int]) -> torch.Tensor:
         """Generate positional encoding for a grid using random spatial frequencies."""
         h, w = size
         grid = torch.ones(
@@ -869,7 +869,7 @@ class PositionEmbeddingRandom(nn.Module):
         pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
         return pe.permute(2, 0, 1)  # C x H x W

-    def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
+    def forward_with_coords(self, coords_input: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
         """Positionally encode input coordinates, normalizing them to [0,1] based on the given image size."""
         coords = coords_input.clone()
         coords[:, :, 0] = coords[:, :, 0] / image_size[1]
@@ -910,12 +910,12 @@ class Block(nn.Module):
         num_heads: int,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = True,
-        norm_layer: Type[nn.Module] = nn.LayerNorm,
-        act_layer: Type[nn.Module] = nn.GELU,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
         use_rel_pos: bool = False,
         rel_pos_zero_init: bool = True,
         window_size: int = 0,
-        input_size: Optional[Tuple[int, int]] = None,
+        input_size: tuple[int, int] | None = None,
     ) -> None:
         """
         Initialize a transformer block with optional window attention and relative positional embeddings.
@@ -1012,7 +1012,7 @@ class REAttention(nn.Module):
         qkv_bias: bool = True,
         use_rel_pos: bool = False,
         rel_pos_zero_init: bool = True,
-        input_size: Optional[Tuple[int, int]] = None,
+        input_size: tuple[int, int] | None = None,
     ) -> None:
         """
         Initialize a Relative Position Attention module for transformer-based architectures.
@@ -1093,9 +1093,9 @@ class PatchEmbed(nn.Module):

     def __init__(
         self,
-        kernel_size: Tuple[int, int] = (16, 16),
-        stride: Tuple[int, int] = (16, 16),
-        padding: Tuple[int, int] = (0, 0),
+        kernel_size: tuple[int, int] = (16, 16),
+        stride: tuple[int, int] = (16, 16),
+        padding: tuple[int, int] = (0, 0),
         in_chans: int = 3,
         embed_dim: int = 768,
     ) -> None:
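Several of the rewritten signatures above take the activation or normalization layer as a class annotated `type[nn.Module]` (for example `activation: type[nn.Module] = nn.GELU`) and instantiate it inside the module. A small illustrative sketch of that convention; the `TinyBlock` class below is hypothetical and not part of ultralytics:

```python
from __future__ import annotations

import torch
from torch import nn


class TinyBlock(nn.Module):  # hypothetical example, not part of the package
    def __init__(self, dim: int, activation: type[nn.Module] = nn.GELU) -> None:
        super().__init__()
        self.fc = nn.Linear(dim, dim)
        self.act = activation()  # the class is called here to create the instance

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(self.fc(x))


x = torch.randn(2, 8)
print(TinyBlock(8)(x).shape)  # torch.Size([2, 8])
```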
ultralytics/models/sam/modules/decoders.py
CHANGED
@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

-from typing import List, Optional, Tuple, Type
+from __future__ import annotations

 import torch
 from torch import nn
@@ -43,7 +43,7 @@ class MaskDecoder(nn.Module):
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
-        activation: Type[nn.Module] = nn.GELU,
+        activation: type[nn.Module] = nn.GELU,
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
     ) -> None:
@@ -93,7 +93,7 @@ class MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Predict masks given image and prompt embeddings.

@@ -137,7 +137,7 @@ class MaskDecoder(nn.Module):
         image_pe: torch.Tensor,
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
         # Concatenate output tokens
         output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
@@ -158,7 +158,7 @@ class MaskDecoder(nn.Module):
         # Upscale mask embeddings and predict masks using the mask tokens
         src = src.transpose(1, 2).view(b, c, h, w)
         upscaled_embedding = self.output_upscaling(src)
-        hyper_in_list: List[torch.Tensor] = [
+        hyper_in_list: list[torch.Tensor] = [
             self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
         ]
         hyper_in = torch.stack(hyper_in_list, dim=1)
@@ -221,7 +221,7 @@ class SAM2MaskDecoder(nn.Module):
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
-        activation: Type[nn.Module] = nn.GELU,
+        activation: type[nn.Module] = nn.GELU,
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
         use_high_res_features: bool = False,
@@ -317,8 +317,8 @@ class SAM2MaskDecoder(nn.Module):
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
         repeat_image: bool,
-        high_res_features: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        high_res_features: list[torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Predict masks given image and prompt embeddings.

@@ -385,8 +385,8 @@ class SAM2MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
         repeat_image: bool,
-        high_res_features: Optional[List[torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        high_res_features: list[torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Predict instance segmentation masks from image and prompt embeddings using a transformer."""
         # Concatenate output tokens
         s = 0
@@ -431,7 +431,7 @@ class SAM2MaskDecoder(nn.Module):
             upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
             upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)

-        hyper_in_list: List[torch.Tensor] = [
+        hyper_in_list: list[torch.Tensor] = [
             self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
         ]
         hyper_in = torch.stack(hyper_in_list, dim=1)
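The `hyper_in_list: list[torch.Tensor]` lines above apply one hypernetwork MLP per mask token and stack the results along a new dimension. A minimal, self-contained sketch of that pattern with toy shapes and plain `nn.Linear` layers standing in for the decoder's MLPs:

```python
from __future__ import annotations

import torch
from torch import nn

num_mask_tokens, b, c = 4, 2, 8
mlps = nn.ModuleList(nn.Linear(c, c) for _ in range(num_mask_tokens))
mask_tokens_out = torch.randn(b, num_mask_tokens, c)

# One MLP per token embedding, then stack along dim=1 to get (b, num_mask_tokens, c).
hyper_in_list: list[torch.Tensor] = [mlps[i](mask_tokens_out[:, i, :]) for i in range(num_mask_tokens)]
hyper_in = torch.stack(hyper_in_list, dim=1)
print(hyper_in.shape)  # torch.Size([2, 4, 8])
```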
ultralytics/models/sam/modules/encoders.py
CHANGED
@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

-from typing import List, Optional, Tuple, Type
+from __future__ import annotations

 import torch
 import torch.nn as nn
@@ -56,13 +56,13 @@ class ImageEncoderViT(nn.Module):
         mlp_ratio: float = 4.0,
         out_chans: int = 256,
         qkv_bias: bool = True,
-        norm_layer: Type[nn.Module] = nn.LayerNorm,
-        act_layer: Type[nn.Module] = nn.GELU,
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
         use_abs_pos: bool = True,
         use_rel_pos: bool = False,
         rel_pos_zero_init: bool = True,
         window_size: int = 0,
-        global_attn_indexes: Tuple[int, ...] = (),
+        global_attn_indexes: tuple[int, ...] = (),
     ) -> None:
         """
         Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
@@ -101,7 +101,7 @@ class ImageEncoderViT(nn.Module):
             embed_dim=embed_dim,
         )

-        self.pos_embed: Optional[nn.Parameter] = None
+        self.pos_embed: nn.Parameter | None = None
         if use_abs_pos:
             # Initialize absolute positional embedding with pretrain image size
             self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
@@ -188,10 +188,10 @@ class PromptEncoder(nn.Module):
     def __init__(
         self,
         embed_dim: int,
-        image_embedding_size: Tuple[int, int],
-        input_image_size: Tuple[int, int],
+        image_embedding_size: tuple[int, int],
+        input_image_size: tuple[int, int],
         mask_in_chans: int,
-        activation: Type[nn.Module] = nn.GELU,
+        activation: type[nn.Module] = nn.GELU,
     ) -> None:
         """
         Initialize the PromptEncoder module for encoding various types of prompts.
@@ -286,9 +286,9 @@ class PromptEncoder(nn.Module):

     @staticmethod
     def _get_batch_size(
-        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
-        boxes: Optional[torch.Tensor],
-        masks: Optional[torch.Tensor],
+        points: tuple[torch.Tensor, torch.Tensor] | None,
+        boxes: torch.Tensor | None,
+        masks: torch.Tensor | None,
     ) -> int:
         """Get the batch size of the output given the batch size of the input prompts."""
         if points is not None:
@@ -302,10 +302,10 @@ class PromptEncoder(nn.Module):

     def forward(
         self,
-        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
-        boxes: Optional[torch.Tensor],
-        masks: Optional[torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        points: tuple[torch.Tensor, torch.Tensor] | None,
+        boxes: torch.Tensor | None,
+        masks: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Embed different types of prompts, returning both sparse and dense embeddings.

@@ -542,13 +542,13 @@ class FpnNeck(nn.Module):
     def __init__(
         self,
         d_model: int,
-        backbone_channel_list: List[int],
+        backbone_channel_list: list[int],
         kernel_size: int = 1,
         stride: int = 1,
         padding: int = 0,
         fpn_interp_model: str = "bilinear",
         fuse_type: str = "sum",
-        fpn_top_down_levels: Optional[List[int]] = None,
+        fpn_top_down_levels: list[int] | None = None,
     ):
         """
         Initialize a modified Feature Pyramid Network (FPN) neck.
@@ -602,7 +602,7 @@ class FpnNeck(nn.Module):
             fpn_top_down_levels = range(len(self.convs))
         self.fpn_top_down_levels = list(fpn_top_down_levels)

-    def forward(self, xs: List[torch.Tensor]):
+    def forward(self, xs: list[torch.Tensor]):
         """
         Perform forward pass through the Feature Pyramid Network (FPN) neck.

@@ -695,20 +695,20 @@ class Hiera(nn.Module):
         num_heads: int = 1,  # initial number of heads
         drop_path_rate: float = 0.0,  # stochastic depth
         q_pool: int = 3,  # number of q_pool stages
-        q_stride: Tuple[int, int] = (2, 2),  # downsample stride bet. stages
-        stages: Tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
+        q_stride: tuple[int, int] = (2, 2),  # downsample stride bet. stages
+        stages: tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
         dim_mul: float = 2.0,  # dim_mul factor at stage shift
         head_mul: float = 2.0,  # head_mul factor at stage shift
-        window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
+        window_pos_embed_bkg_spatial_size: tuple[int, int] = (14, 14),
         # window size per stage, when not using global att.
-        window_spec: Tuple[int, ...] = (
+        window_spec: tuple[int, ...] = (
             8,
             4,
             14,
             7,
         ),
         # global attn in these blocks
-        global_att_blocks: Tuple[int, ...] = (
+        global_att_blocks: tuple[int, ...] = (
             12,
             16,
             20,
@@ -806,7 +806,7 @@ class Hiera(nn.Module):
             else [self.blocks[-1].dim_out]
         )

-    def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
+    def _get_pos_embed(self, hw: tuple[int, int]) -> torch.Tensor:
         """Generate positional embeddings by interpolating and combining window and background embeddings."""
         h, w = hw
         window_embed = self.pos_embed_window
@@ -815,7 +815,7 @@ class Hiera(nn.Module):
         pos_embed = pos_embed.permute(0, 2, 3, 1)
         return pos_embed

-    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
         """
         Perform forward pass through Hiera model, extracting multiscale features from input images.
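The `_get_batch_size` signature above now spells out that each prompt type may be absent. A hedged sketch of the contract it implies, written as a standalone function and simplified from the method body that the `if points is not None:` context line hints at:

```python
from __future__ import annotations

import torch


def get_batch_size(
    points: tuple[torch.Tensor, torch.Tensor] | None,
    boxes: torch.Tensor | None,
    masks: torch.Tensor | None,
) -> int:
    """Infer batch size from whichever optional prompt is provided, defaulting to 1."""
    if points is not None:
        return points[0].shape[0]
    if boxes is not None:
        return boxes.shape[0]
    if masks is not None:
        return masks.shape[0]
    return 1


print(get_batch_size(None, torch.zeros(3, 4), None))  # 3
```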
ultralytics/models/sam/modules/memory_attention.py
CHANGED
@@ -1,7 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+from __future__ import annotations
+
 import copy
-from typing import Optional

 import torch
 from torch import nn
@@ -103,7 +104,7 @@ class MemoryAttentionLayer(nn.Module):
         self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
         self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

-    def _forward_sa(self, tgt: torch.Tensor, query_pos: Optional[torch.Tensor]) -> torch.Tensor:
+    def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
         """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
         tgt2 = self.norm1(tgt)
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -115,8 +116,8 @@ class MemoryAttentionLayer(nn.Module):
         self,
         tgt: torch.Tensor,
         memory: torch.Tensor,
-        query_pos: Optional[torch.Tensor],
-        pos: Optional[torch.Tensor],
+        query_pos: torch.Tensor | None,
+        pos: torch.Tensor | None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
@@ -140,8 +141,8 @@ class MemoryAttentionLayer(nn.Module):
         self,
         tgt: torch.Tensor,
         memory: torch.Tensor,
-        pos: Optional[torch.Tensor] = None,
-        query_pos: Optional[torch.Tensor] = None,
+        pos: torch.Tensor | None = None,
+        query_pos: torch.Tensor | None = None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """
@@ -242,8 +243,8 @@ class MemoryAttention(nn.Module):
         self,
         curr: torch.Tensor,  # self-attention inputs
         memory: torch.Tensor,  # cross-attention inputs
-        curr_pos: Optional[torch.Tensor] = None,  # pos_enc for self-attention inputs
-        memory_pos: Optional[torch.Tensor] = None,  # pos_enc for cross-attention inputs
+        curr_pos: torch.Tensor | None = None,  # pos_enc for self-attention inputs
+        memory_pos: torch.Tensor | None = None,  # pos_enc for cross-attention inputs
         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
     ) -> torch.Tensor:
         """
ultralytics/models/sam/modules/sam.py
CHANGED
@@ -3,10 +3,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.

-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-from typing import List
+from __future__ import annotations

 import torch
 import torch.nn.functional as F
@@ -61,8 +58,8 @@ class SAMModel(nn.Module):
         image_encoder: ImageEncoderViT,
         prompt_encoder: PromptEncoder,
         mask_decoder: MaskDecoder,
-        pixel_mean: List[float] = (123.675, 116.28, 103.53),
-        pixel_std: List[float] = (58.395, 57.12, 57.375),
+        pixel_mean: list[float] = (123.675, 116.28, 103.53),
+        pixel_std: list[float] = (58.395, 57.12, 57.375),
     ) -> None:
         """
         Initialize the SAMModel class to predict object masks from an image and input prompts.
@@ -959,7 +956,6 @@ class SAM2Model(torch.nn.Module):
         prev_sam_mask_logits=None,
     ):
         """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
-        current_out = {}
         sam_outputs, _, _ = self._track_step(
             frame_idx,
             is_init_cond_frame,
@@ -975,9 +971,11 @@ class SAM2Model(torch.nn.Module):
         )
         _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

-        current_out["pred_masks"] = low_res_masks
-        current_out["pred_masks_high_res"] = high_res_masks
-        current_out["obj_ptr"] = obj_ptr
+        current_out = {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+        }
         if not self.training:
             # Only add this in inference (to avoid unused param in activation checkpointing;
             # it's mainly used in the demo to encode spatial memories w/ consolidated masks)