dgenerate-ultralytics-headless 8.3.189__py3-none-any.whl → 8.3.191__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/RECORD +111 -109
- tests/test_cuda.py +6 -5
- tests/test_exports.py +1 -6
- tests/test_python.py +1 -4
- tests/test_solutions.py +1 -1
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -14
- ultralytics/cfg/datasets/VisDrone.yaml +4 -4
- ultralytics/data/annotator.py +6 -6
- ultralytics/data/augment.py +53 -51
- ultralytics/data/base.py +15 -13
- ultralytics/data/build.py +7 -4
- ultralytics/data/converter.py +9 -10
- ultralytics/data/dataset.py +24 -22
- ultralytics/data/loaders.py +13 -11
- ultralytics/data/split.py +4 -3
- ultralytics/data/split_dota.py +14 -12
- ultralytics/data/utils.py +31 -25
- ultralytics/engine/exporter.py +7 -4
- ultralytics/engine/model.py +16 -14
- ultralytics/engine/predictor.py +9 -7
- ultralytics/engine/results.py +59 -57
- ultralytics/engine/trainer.py +7 -0
- ultralytics/engine/tuner.py +4 -3
- ultralytics/engine/validator.py +3 -1
- ultralytics/hub/__init__.py +6 -2
- ultralytics/hub/auth.py +2 -2
- ultralytics/hub/google/__init__.py +9 -8
- ultralytics/hub/session.py +11 -11
- ultralytics/hub/utils.py +8 -9
- ultralytics/models/fastsam/model.py +8 -6
- ultralytics/models/nas/model.py +5 -3
- ultralytics/models/rtdetr/train.py +4 -3
- ultralytics/models/rtdetr/val.py +6 -4
- ultralytics/models/sam/amg.py +13 -10
- ultralytics/models/sam/model.py +3 -2
- ultralytics/models/sam/modules/blocks.py +21 -21
- ultralytics/models/sam/modules/decoders.py +11 -11
- ultralytics/models/sam/modules/encoders.py +25 -25
- ultralytics/models/sam/modules/memory_attention.py +9 -8
- ultralytics/models/sam/modules/sam.py +8 -10
- ultralytics/models/sam/modules/tiny_encoder.py +21 -20
- ultralytics/models/sam/modules/transformer.py +6 -5
- ultralytics/models/sam/modules/utils.py +7 -5
- ultralytics/models/sam/predict.py +32 -31
- ultralytics/models/utils/loss.py +29 -27
- ultralytics/models/utils/ops.py +10 -8
- ultralytics/models/yolo/classify/train.py +7 -5
- ultralytics/models/yolo/classify/val.py +10 -8
- ultralytics/models/yolo/detect/predict.py +3 -3
- ultralytics/models/yolo/detect/train.py +8 -6
- ultralytics/models/yolo/detect/val.py +23 -21
- ultralytics/models/yolo/model.py +14 -14
- ultralytics/models/yolo/obb/train.py +5 -3
- ultralytics/models/yolo/obb/val.py +13 -10
- ultralytics/models/yolo/pose/train.py +7 -5
- ultralytics/models/yolo/pose/val.py +11 -9
- ultralytics/models/yolo/segment/train.py +4 -5
- ultralytics/models/yolo/segment/val.py +12 -10
- ultralytics/models/yolo/world/train.py +9 -7
- ultralytics/models/yolo/yoloe/train.py +7 -6
- ultralytics/models/yolo/yoloe/val.py +10 -8
- ultralytics/nn/autobackend.py +40 -52
- ultralytics/nn/modules/__init__.py +3 -3
- ultralytics/nn/modules/block.py +12 -12
- ultralytics/nn/modules/conv.py +4 -3
- ultralytics/nn/modules/head.py +46 -38
- ultralytics/nn/modules/transformer.py +22 -21
- ultralytics/nn/tasks.py +2 -2
- ultralytics/nn/text_model.py +6 -5
- ultralytics/solutions/analytics.py +7 -5
- ultralytics/solutions/config.py +12 -10
- ultralytics/solutions/distance_calculation.py +3 -3
- ultralytics/solutions/heatmap.py +4 -2
- ultralytics/solutions/object_counter.py +5 -3
- ultralytics/solutions/parking_management.py +4 -2
- ultralytics/solutions/region_counter.py +7 -5
- ultralytics/solutions/similarity_search.py +5 -3
- ultralytics/solutions/solutions.py +38 -36
- ultralytics/solutions/streamlit_inference.py +8 -7
- ultralytics/trackers/bot_sort.py +11 -9
- ultralytics/trackers/byte_tracker.py +17 -15
- ultralytics/trackers/utils/gmc.py +4 -3
- ultralytics/utils/__init__.py +27 -77
- ultralytics/utils/autobatch.py +3 -2
- ultralytics/utils/autodevice.py +10 -10
- ultralytics/utils/benchmarks.py +11 -10
- ultralytics/utils/callbacks/comet.py +9 -9
- ultralytics/utils/callbacks/platform.py +2 -1
- ultralytics/utils/checks.py +20 -29
- ultralytics/utils/downloads.py +2 -2
- ultralytics/utils/export.py +12 -11
- ultralytics/utils/files.py +8 -7
- ultralytics/utils/git.py +139 -0
- ultralytics/utils/instance.py +8 -7
- ultralytics/utils/logger.py +7 -6
- ultralytics/utils/loss.py +15 -13
- ultralytics/utils/metrics.py +62 -62
- ultralytics/utils/nms.py +346 -0
- ultralytics/utils/ops.py +83 -251
- ultralytics/utils/patches.py +6 -4
- ultralytics/utils/plotting.py +18 -16
- ultralytics/utils/tal.py +1 -1
- ultralytics/utils/torch_utils.py +4 -2
- ultralytics/utils/tqdm.py +47 -33
- ultralytics/utils/triton.py +3 -2
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/top_level.txt +0 -0
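
Most of the changes shown in the hunks below (all from `ultralytics/models/sam/`) follow one pattern: each module adds `from __future__ import annotations`, drops its `typing` imports (`List`, `Optional`, `Tuple`, `Type`, ...), and rewrites hints in PEP 585/604 style (`list[torch.Tensor]`, `torch.Tensor | None`, `type[nn.Module]`). A minimal sketch of that style, using a hypothetical helper that is not part of the package:

```python
# Illustrative only: a hypothetical function showing the annotation style this release adopts.
from __future__ import annotations  # hints are no longer evaluated at runtime, so the new syntax also works on older Pythons

from torch import nn


def build_head(
    channels: list[int],                    # previously List[int]
    activation: type[nn.Module] = nn.GELU,  # previously Type[nn.Module]
    dropout: float | None = None,           # previously Optional[float]
) -> tuple[nn.Module, int]:                 # previously Tuple[nn.Module, int]
    """Build a small MLP head and return it together with its output width."""
    layers: list[nn.Module] = []
    for c_in, c_out in zip(channels[:-1], channels[1:]):
        layers += [nn.Linear(c_in, c_out), activation()]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
    return nn.Sequential(*layers), channels[-1]
```

Because the future import keeps annotations unevaluated, this kind of change should be behavior-neutral for callers; only the source text of the hints differs.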

ultralytics/models/sam/modules/decoders.py

@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from
+from __future__ import annotations
 
 import torch
 from torch import nn
@@ -43,7 +43,7 @@ class MaskDecoder(nn.Module):
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
-        activation:
+        activation: type[nn.Module] = nn.GELU,
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
     ) -> None:
@@ -93,7 +93,7 @@ class MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Predict masks given image and prompt embeddings.
 
@@ -137,7 +137,7 @@ class MaskDecoder(nn.Module):
         image_pe: torch.Tensor,
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
         # Concatenate output tokens
         output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
@@ -158,7 +158,7 @@ class MaskDecoder(nn.Module):
         # Upscale mask embeddings and predict masks using the mask tokens
         src = src.transpose(1, 2).view(b, c, h, w)
         upscaled_embedding = self.output_upscaling(src)
-        hyper_in_list:
+        hyper_in_list: list[torch.Tensor] = [
             self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
         ]
         hyper_in = torch.stack(hyper_in_list, dim=1)
@@ -221,7 +221,7 @@ class SAM2MaskDecoder(nn.Module):
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
-        activation:
+        activation: type[nn.Module] = nn.GELU,
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
         use_high_res_features: bool = False,
@@ -317,8 +317,8 @@ class SAM2MaskDecoder(nn.Module):
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
         repeat_image: bool,
-        high_res_features:
-    ) ->
+        high_res_features: list[torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Predict masks given image and prompt embeddings.
 
@@ -385,8 +385,8 @@ class SAM2MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
         repeat_image: bool,
-        high_res_features:
-    ) ->
+        high_res_features: list[torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Predict instance segmentation masks from image and prompt embeddings using a transformer."""
         # Concatenate output tokens
         s = 0
@@ -431,7 +431,7 @@ class SAM2MaskDecoder(nn.Module):
         upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
         upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
 
-        hyper_in_list:
+        hyper_in_list: list[torch.Tensor] = [
             self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
         ]
         hyper_in = torch.stack(hyper_in_list, dim=1)

ultralytics/models/sam/modules/encoders.py

@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from
+from __future__ import annotations
 
 import torch
 import torch.nn as nn
@@ -56,13 +56,13 @@ class ImageEncoderViT(nn.Module):
         mlp_ratio: float = 4.0,
         out_chans: int = 256,
         qkv_bias: bool = True,
-        norm_layer:
-        act_layer:
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
         use_abs_pos: bool = True,
         use_rel_pos: bool = False,
         rel_pos_zero_init: bool = True,
         window_size: int = 0,
-        global_attn_indexes:
+        global_attn_indexes: tuple[int, ...] = (),
     ) -> None:
         """
         Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
@@ -101,7 +101,7 @@ class ImageEncoderViT(nn.Module):
             embed_dim=embed_dim,
         )
 
-        self.pos_embed:
+        self.pos_embed: nn.Parameter | None = None
         if use_abs_pos:
             # Initialize absolute positional embedding with pretrain image size
             self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
@@ -188,10 +188,10 @@ class PromptEncoder(nn.Module):
     def __init__(
         self,
         embed_dim: int,
-        image_embedding_size:
-        input_image_size:
+        image_embedding_size: tuple[int, int],
+        input_image_size: tuple[int, int],
         mask_in_chans: int,
-        activation:
+        activation: type[nn.Module] = nn.GELU,
     ) -> None:
         """
         Initialize the PromptEncoder module for encoding various types of prompts.
@@ -286,9 +286,9 @@ class PromptEncoder(nn.Module):
 
     @staticmethod
     def _get_batch_size(
-        points:
-        boxes:
-        masks:
+        points: tuple[torch.Tensor, torch.Tensor] | None,
+        boxes: torch.Tensor | None,
+        masks: torch.Tensor | None,
     ) -> int:
         """Get the batch size of the output given the batch size of the input prompts."""
         if points is not None:
@@ -302,10 +302,10 @@ class PromptEncoder(nn.Module):
 
     def forward(
         self,
-        points:
-        boxes:
-        masks:
-    ) ->
+        points: tuple[torch.Tensor, torch.Tensor] | None,
+        boxes: torch.Tensor | None,
+        masks: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Embed different types of prompts, returning both sparse and dense embeddings.
 
@@ -542,13 +542,13 @@ class FpnNeck(nn.Module):
     def __init__(
         self,
         d_model: int,
-        backbone_channel_list:
+        backbone_channel_list: list[int],
         kernel_size: int = 1,
         stride: int = 1,
         padding: int = 0,
         fpn_interp_model: str = "bilinear",
         fuse_type: str = "sum",
-        fpn_top_down_levels:
+        fpn_top_down_levels: list[int] | None = None,
     ):
         """
         Initialize a modified Feature Pyramid Network (FPN) neck.
@@ -602,7 +602,7 @@ class FpnNeck(nn.Module):
             fpn_top_down_levels = range(len(self.convs))
         self.fpn_top_down_levels = list(fpn_top_down_levels)
 
-    def forward(self, xs:
+    def forward(self, xs: list[torch.Tensor]):
         """
         Perform forward pass through the Feature Pyramid Network (FPN) neck.
 
@@ -695,20 +695,20 @@ class Hiera(nn.Module):
         num_heads: int = 1,  # initial number of heads
         drop_path_rate: float = 0.0,  # stochastic depth
         q_pool: int = 3,  # number of q_pool stages
-        q_stride:
-        stages:
+        q_stride: tuple[int, int] = (2, 2),  # downsample stride bet. stages
+        stages: tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
         dim_mul: float = 2.0,  # dim_mul factor at stage shift
         head_mul: float = 2.0,  # head_mul factor at stage shift
-        window_pos_embed_bkg_spatial_size:
+        window_pos_embed_bkg_spatial_size: tuple[int, int] = (14, 14),
         # window size per stage, when not using global att.
-        window_spec:
+        window_spec: tuple[int, ...] = (
             8,
             4,
             14,
             7,
         ),
         # global attn in these blocks
-        global_att_blocks:
+        global_att_blocks: tuple[int, ...] = (
            12,
            16,
            20,
@@ -806,7 +806,7 @@ class Hiera(nn.Module):
             else [self.blocks[-1].dim_out]
         )
 
-    def _get_pos_embed(self, hw:
+    def _get_pos_embed(self, hw: tuple[int, int]) -> torch.Tensor:
         """Generate positional embeddings by interpolating and combining window and background embeddings."""
         h, w = hw
         window_embed = self.pos_embed_window
@@ -815,7 +815,7 @@ class Hiera(nn.Module):
         pos_embed = pos_embed.permute(0, 2, 3, 1)
         return pos_embed
 
-    def forward(self, x: torch.Tensor) ->
+    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
         """
         Perform forward pass through Hiera model, extracting multiscale features from input images.
 

ultralytics/models/sam/modules/memory_attention.py

@@ -1,7 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 import copy
-from typing import Optional
 
 import torch
 from torch import nn
@@ -103,7 +104,7 @@ class MemoryAttentionLayer(nn.Module):
         self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
         self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
 
-    def _forward_sa(self, tgt: torch.Tensor, query_pos:
+    def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
         """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
         tgt2 = self.norm1(tgt)
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -115,8 +116,8 @@ class MemoryAttentionLayer(nn.Module):
         self,
         tgt: torch.Tensor,
         memory: torch.Tensor,
-        query_pos:
-        pos:
+        query_pos: torch.Tensor | None,
+        pos: torch.Tensor | None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
@@ -140,8 +141,8 @@ class MemoryAttentionLayer(nn.Module):
         self,
         tgt: torch.Tensor,
         memory: torch.Tensor,
-        pos:
-        query_pos:
+        pos: torch.Tensor | None = None,
+        query_pos: torch.Tensor | None = None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """
@@ -242,8 +243,8 @@ class MemoryAttention(nn.Module):
         self,
         curr: torch.Tensor,  # self-attention inputs
         memory: torch.Tensor,  # cross-attention inputs
-        curr_pos:
-        memory_pos:
+        curr_pos: torch.Tensor | None = None,  # pos_enc for self-attention inputs
+        memory_pos: torch.Tensor | None = None,  # pos_enc for cross-attention inputs
         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
     ) -> torch.Tensor:
         """

ultralytics/models/sam/modules/sam.py

@@ -3,10 +3,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 
-
-# LICENSE file in the root directory of this source tree.
-
-from typing import List
+from __future__ import annotations
 
 import torch
 import torch.nn.functional as F
@@ -61,8 +58,8 @@ class SAMModel(nn.Module):
         image_encoder: ImageEncoderViT,
         prompt_encoder: PromptEncoder,
         mask_decoder: MaskDecoder,
-        pixel_mean:
-        pixel_std:
+        pixel_mean: list[float] = (123.675, 116.28, 103.53),
+        pixel_std: list[float] = (58.395, 57.12, 57.375),
     ) -> None:
         """
         Initialize the SAMModel class to predict object masks from an image and input prompts.
@@ -959,7 +956,6 @@ class SAM2Model(torch.nn.Module):
         prev_sam_mask_logits=None,
     ):
         """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
-        current_out = {}
         sam_outputs, _, _ = self._track_step(
             frame_idx,
             is_init_cond_frame,
@@ -975,9 +971,11 @@ class SAM2Model(torch.nn.Module):
         )
         _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs
 
-        current_out
-
-
+        current_out = {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+        }
         if not self.training:
             # Only add this in inference (to avoid unused param in activation checkpointing;
             # it's mainly used in the demo to encode spatial memories w/ consolidated masks)

ultralytics/models/sam/modules/tiny_encoder.py

@@ -9,8 +9,9 @@
 # Build the TinyViT Model
 # --------------------------------------------------------
 
+from __future__ import annotations
+
 import itertools
-from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -106,7 +107,7 @@ class PatchEmbed(nn.Module):
             activation (nn.Module): Activation function to use between convolutions.
         """
         super().__init__()
-        img_size:
+        img_size: tuple[int, int] = to_2tuple(resolution)
         self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
         self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
         self.in_chans = in_chans
@@ -219,7 +220,7 @@ class PatchMerging(nn.Module):
         torch.Size([4, 3136, 128])
     """
 
-    def __init__(self, input_resolution:
+    def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
         """
         Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.
 
@@ -283,13 +284,13 @@ class ConvLayer(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution:
+        input_resolution: tuple[int, int],
         depth: int,
         activation,
-        drop_path:
-        downsample:
+        drop_path: float | list[float] = 0.0,
+        downsample: nn.Module | None = None,
         use_checkpoint: bool = False,
-        out_dim:
+        out_dim: int | None = None,
         conv_expand_ratio: float = 4.0,
     ):
         """
@@ -370,8 +371,8 @@ class MLP(nn.Module):
     def __init__(
         self,
         in_features: int,
-        hidden_features:
-        out_features:
+        hidden_features: int | None = None,
+        out_features: int | None = None,
         activation=nn.GELU,
         drop: float = 0.0,
     ):
@@ -441,7 +442,7 @@ class Attention(torch.nn.Module):
         key_dim: int,
         num_heads: int = 8,
         attn_ratio: float = 4,
-        resolution:
+        resolution: tuple[int, int] = (14, 14),
     ):
         """
         Initialize the Attention module for multi-head attention with spatial awareness.
@@ -549,7 +550,7 @@ class TinyViTBlock(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution:
+        input_resolution: tuple[int, int],
         num_heads: int,
         window_size: int = 7,
         mlp_ratio: float = 4.0,
@@ -690,18 +691,18 @@ class BasicLayer(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution:
+        input_resolution: tuple[int, int],
         depth: int,
         num_heads: int,
         window_size: int,
         mlp_ratio: float = 4.0,
         drop: float = 0.0,
-        drop_path:
-        downsample:
+        drop_path: float | list[float] = 0.0,
+        downsample: nn.Module | None = None,
         use_checkpoint: bool = False,
         local_conv_size: int = 3,
         activation=nn.GELU,
-        out_dim:
+        out_dim: int | None = None,
     ):
         """
         Initialize a BasicLayer in the TinyViT architecture.
@@ -800,10 +801,10 @@ class TinyViT(nn.Module):
         img_size: int = 224,
         in_chans: int = 3,
         num_classes: int = 1000,
-        embed_dims:
-        depths:
-        num_heads:
-        window_sizes:
+        embed_dims: tuple[int, int, int, int] = (96, 192, 384, 768),
+        depths: tuple[int, int, int, int] = (2, 2, 6, 2),
+        num_heads: tuple[int, int, int, int] = (3, 6, 12, 24),
+        window_sizes: tuple[int, int, int, int] = (7, 7, 14, 7),
         mlp_ratio: float = 4.0,
         drop_rate: float = 0.0,
         drop_path_rate: float = 0.1,
@@ -980,7 +981,7 @@ class TinyViT(nn.Module):
         """Perform the forward pass through the TinyViT model, extracting features from the input image."""
         return self.forward_features(x)
 
-    def set_imgsz(self, imgsz:
+    def set_imgsz(self, imgsz: list[int] = [1024, 1024]):
         """Set image size to make model compatible with different image sizes."""
         imgsz = [s // 4 for s in imgsz]
         self.patches_resolution = imgsz

ultralytics/models/sam/modules/transformer.py

@@ -1,7 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 import math
-from typing import Tuple, Type
 
 import torch
 from torch import Tensor, nn
@@ -44,7 +45,7 @@ class TwoWayTransformer(nn.Module):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int,
-        activation:
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
     ) -> None:
         """
@@ -85,7 +86,7 @@ class TwoWayTransformer(nn.Module):
         image_embedding: torch.Tensor,
         image_pe: torch.Tensor,
         point_embedding: torch.Tensor,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Process image and point embeddings through the Two-Way Transformer.
 
@@ -162,7 +163,7 @@ class TwoWayAttentionBlock(nn.Module):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int = 2048,
-        activation:
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
         skip_first_layer_pe: bool = False,
     ) -> None:
@@ -198,7 +199,7 @@ class TwoWayAttentionBlock(nn.Module):
 
     def forward(
         self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Apply two-way attention to process query and key embeddings in a transformer block.
 

ultralytics/models/sam/modules/utils.py

@@ -1,12 +1,14 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from
+from __future__ import annotations
+
+from typing import Any
 
 import torch
 import torch.nn.functional as F
 
 
-def select_closest_cond_frames(frame_idx: int, cond_frame_outputs:
+def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any], max_cond_frame_num: int):
     """
     Select the closest conditioning frames to a given frame index.
 
@@ -248,7 +250,7 @@ def window_partition(x: torch.Tensor, window_size: int):
     return windows, (Hp, Wp)
 
 
-def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw:
+def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[int, int], hw: tuple[int, int]):
     """
     Unpartition windowed sequences into original sequences and remove padding.
 
@@ -333,8 +335,8 @@ def add_decomposed_rel_pos(
     q: torch.Tensor,
     rel_pos_h: torch.Tensor,
     rel_pos_w: torch.Tensor,
-    q_size:
-    k_size:
+    q_size: tuple[int, int],
+    k_size: tuple[int, int],
 ) -> torch.Tensor:
     """
     Add decomposed Relative Positional Embeddings to the attention map.

ultralytics/models/sam/predict.py

@@ -8,8 +8,10 @@ using SAM. It forms an integral part of the Ultralytics framework and is designe
 segmentation tasks.
 """
 
+from __future__ import annotations
+
 from collections import OrderedDict
-from typing import Any
+from typing import Any
 
 import cv2
 import numpy as np
@@ -1717,9 +1719,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     def __init__(
         self,
         cfg: Any = DEFAULT_CFG,
-        overrides:
+        overrides: dict[str, Any] | None = None,
         max_obj_num: int = 3,
-        _callbacks:
+        _callbacks: dict[str, Any] | None = None,
     ) -> None:
         """
         Initialize the predictor with configuration and optional overrides.
@@ -1759,14 +1761,14 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     @smart_inference_mode()
     def inference(
         self,
-        img:
-        bboxes:
-        masks:
-        points:
-        labels:
-        obj_ids:
+        img: torch.Tensor | np.ndarray,
+        bboxes: list[list[float]] | None = None,
+        masks: torch.Tensor | np.ndarray | None = None,
+        points: list[list[float]] | None = None,
+        labels: list[int] | None = None,
+        obj_ids: list[int] | None = None,
         update_memory: bool = False,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Perform inference on a single image with optional bounding boxes, masks, points and object IDs.
         It has two modes: one is to run inference on a single image without updating the memory,
@@ -1824,7 +1826,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         pred_scores = torch.clamp_(pred_scores / 32, min=0)
         return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
 
-    def get_im_features(self, img:
+    def get_im_features(self, img: torch.Tensor | np.ndarray) -> None:
         """
         Initialize the image state by processing the input image and extracting features.
 
@@ -1844,10 +1846,10 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     @smart_inference_mode()
     def update_memory(
         self,
-        obj_ids:
-        points:
-        labels:
-        masks:
+        obj_ids: list[int] = None,
+        points: torch.Tensor | None = None,
+        labels: torch.Tensor | None = None,
+        masks: torch.Tensor | None = None,
     ) -> None:
         """
         Append the imgState to the memory_bank and update the memory for the model.
@@ -1923,7 +1925,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
         self.memory_bank.append(consolidated_out)
 
-    def _prepare_memory_conditioned_features(self, obj_idx:
+    def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
         """
         Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
         prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features for all
@@ -1958,7 +1960,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             *self.feat_sizes[-1],
         )
 
-    def get_maskmem_enc(self) ->
+    def get_maskmem_enc(self) -> tuple[torch.Tensor, torch.Tensor]:
         """Get the memory and positional encoding from the memory, which is used to condition the current image
         features.
         """
@@ -1973,7 +1975,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
         return memory, memory_pos_embed
 
-    def _obj_id_to_idx(self, obj_id: int) ->
+    def _obj_id_to_idx(self, obj_id: int) -> int | None:
         """
         Map client-side object id to model-side object index.
 
@@ -1987,11 +1989,11 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 
     def track_step(
         self,
-        obj_idx:
-        point:
-        label:
-        mask:
-    ) ->
+        obj_idx: int | None = None,
+        point: torch.Tensor | None = None,
+        label: torch.Tensor | None = None,
+        mask: torch.Tensor | None = None,
+    ) -> dict[str, Any]:
         """
         Tracking step for the current image state to predict masks.
 
@@ -2010,7 +2012,6 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
            current_out (Dict[str, Any]): A dictionary containing the current output with mask predictions and object pointers.
                Keys include 'point_inputs', 'mask_inputs', 'pred_masks', 'pred_masks_high_res', 'obj_ptr', 'object_score_logits'.
        """
-        current_out = {}
        if mask is not None and self.model.use_mask_input_as_output_without_sam:
            # When use_mask_input_as_output_without_sam=True, we directly output the mask input
            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
@@ -2021,7 +2022,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
            # fused the visual feature with previous memory features in the memory bank
            pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
            # calculate the first feature if adding obj_idx exists(means adding prompts)
-           pix_feat_with_mem = pix_feat_with_mem[
+           pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
            _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
                backbone_features=pix_feat_with_mem,
                point_inputs={"point_coords": point, "point_labels": label} if obj_idx is not None else None,
@@ -2029,9 +2030,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
                multimask_output=False,
                high_res_features=[feat[: pix_feat_with_mem.size(0)] for feat in self.high_res_features],
            )
-
-
-
-
-
-
+        return {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+            "object_score_logits": object_score_logits,
+        }