frontveg 0.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- frontveg/__init__.py +11 -0
- frontveg/_tests/__init__.py +0 -0
- frontveg/_tests/test_widget.py +66 -0
- frontveg/_version.py +21 -0
- frontveg/_widget.py +132 -0
- frontveg/napari.yaml +14 -0
- frontveg/utils.py +95 -0
- frontveg-0.1.dev1.dist-info/METADATA +143 -0
- frontveg-0.1.dev1.dist-info/RECORD +44 -0
- frontveg-0.1.dev1.dist-info/WHEEL +5 -0
- frontveg-0.1.dev1.dist-info/entry_points.txt +2 -0
- frontveg-0.1.dev1.dist-info/licenses/LICENSE +28 -0
- frontveg-0.1.dev1.dist-info/top_level.txt +2 -0
- sam2/__init__.py +11 -0
- sam2/automatic_mask_generator.py +454 -0
- sam2/build_sam.py +167 -0
- sam2/configs/sam2/sam2_hiera_b+.yaml +113 -0
- sam2/configs/sam2/sam2_hiera_l.yaml +117 -0
- sam2/configs/sam2/sam2_hiera_s.yaml +116 -0
- sam2/configs/sam2/sam2_hiera_t.yaml +118 -0
- sam2/modeling/__init__.py +5 -0
- sam2/modeling/backbones/__init__.py +5 -0
- sam2/modeling/backbones/hieradet.py +317 -0
- sam2/modeling/backbones/image_encoder.py +134 -0
- sam2/modeling/backbones/utils.py +95 -0
- sam2/modeling/memory_attention.py +169 -0
- sam2/modeling/memory_encoder.py +181 -0
- sam2/modeling/position_encoding.py +221 -0
- sam2/modeling/sam/__init__.py +5 -0
- sam2/modeling/sam/mask_decoder.py +295 -0
- sam2/modeling/sam/prompt_encoder.py +182 -0
- sam2/modeling/sam/transformer.py +360 -0
- sam2/modeling/sam2_base.py +907 -0
- sam2/modeling/sam2_utils.py +323 -0
- sam2/sam2_hiera_b+.yaml +1 -0
- sam2/sam2_hiera_l.yaml +1 -0
- sam2/sam2_hiera_s.yaml +1 -0
- sam2/sam2_hiera_t.yaml +1 -0
- sam2/sam2_image_predictor.py +466 -0
- sam2/sam2_video_predictor.py +1172 -0
- sam2/utils/__init__.py +5 -0
- sam2/utils/amg.py +348 -0
- sam2/utils/misc.py +349 -0
- sam2/utils/transforms.py +118 -0
@@ -0,0 +1,113 @@
|
|
1
|
+
# @package _global_
|
2
|
+
|
3
|
+
# Model
|
4
|
+
model:
|
5
|
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
6
|
+
image_encoder:
|
7
|
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8
|
+
scalp: 1
|
9
|
+
trunk:
|
10
|
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11
|
+
embed_dim: 112
|
12
|
+
num_heads: 2
|
13
|
+
neck:
|
14
|
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
15
|
+
position_encoding:
|
16
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
17
|
+
num_pos_feats: 256
|
18
|
+
normalize: true
|
19
|
+
scale: null
|
20
|
+
temperature: 10000
|
21
|
+
d_model: 256
|
22
|
+
backbone_channel_list: [896, 448, 224, 112]
|
23
|
+
fpn_top_down_levels: [2, 3] # output levels 0 and 1 directly use the backbone features
|
24
|
+
fpn_interp_model: nearest
|
25
|
+
|
26
|
+
memory_attention:
|
27
|
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
28
|
+
d_model: 256
|
29
|
+
pos_enc_at_input: true
|
30
|
+
layer:
|
31
|
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
32
|
+
activation: relu
|
33
|
+
dim_feedforward: 2048
|
34
|
+
dropout: 0.1
|
35
|
+
pos_enc_at_attn: false
|
36
|
+
self_attention:
|
37
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
38
|
+
rope_theta: 10000.0
|
39
|
+
feat_sizes: [32, 32]
|
40
|
+
embedding_dim: 256
|
41
|
+
num_heads: 1
|
42
|
+
downsample_rate: 1
|
43
|
+
dropout: 0.1
|
44
|
+
d_model: 256
|
45
|
+
pos_enc_at_cross_attn_keys: true
|
46
|
+
pos_enc_at_cross_attn_queries: false
|
47
|
+
cross_attention:
|
48
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
49
|
+
rope_theta: 10000.0
|
50
|
+
feat_sizes: [32, 32]
|
51
|
+
rope_k_repeat: True
|
52
|
+
embedding_dim: 256
|
53
|
+
num_heads: 1
|
54
|
+
downsample_rate: 1
|
55
|
+
dropout: 0.1
|
56
|
+
kv_in_dim: 64
|
57
|
+
num_layers: 4
|
58
|
+
|
59
|
+
memory_encoder:
|
60
|
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
61
|
+
out_dim: 64
|
62
|
+
position_encoding:
|
63
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
64
|
+
num_pos_feats: 64
|
65
|
+
normalize: true
|
66
|
+
scale: null
|
67
|
+
temperature: 10000
|
68
|
+
mask_downsampler:
|
69
|
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
70
|
+
kernel_size: 3
|
71
|
+
stride: 2
|
72
|
+
padding: 1
|
73
|
+
fuser:
|
74
|
+
_target_: sam2.modeling.memory_encoder.Fuser
|
75
|
+
layer:
|
76
|
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
77
|
+
dim: 256
|
78
|
+
kernel_size: 7
|
79
|
+
padding: 3
|
80
|
+
layer_scale_init_value: 1e-6
|
81
|
+
use_dwconv: True # depth-wise convs
|
82
|
+
num_layers: 2
|
83
|
+
|
84
|
+
num_maskmem: 7
|
85
|
+
image_size: 1024
|
86
|
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
87
|
+
sigmoid_scale_for_mem_enc: 20.0
|
88
|
+
sigmoid_bias_for_mem_enc: -10.0
|
89
|
+
use_mask_input_as_output_without_sam: true
|
90
|
+
# Memory
|
91
|
+
directly_add_no_mem_embed: true
|
92
|
+
# use high-resolution feature map in the SAM mask decoder
|
93
|
+
use_high_res_features_in_sam: true
|
94
|
+
# output 3 masks on the first click on initial conditioning frames
|
95
|
+
multimask_output_in_sam: true
|
96
|
+
# SAM heads
|
97
|
+
iou_prediction_use_sigmoid: True
|
98
|
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
99
|
+
use_obj_ptrs_in_encoder: true
|
100
|
+
add_tpos_enc_to_obj_ptrs: false
|
101
|
+
only_obj_ptrs_in_the_past_for_eval: true
|
102
|
+
# object occlusion prediction
|
103
|
+
pred_obj_scores: true
|
104
|
+
pred_obj_scores_mlp: true
|
105
|
+
fixed_no_obj_ptr: true
|
106
|
+
# multimask tracking settings
|
107
|
+
multimask_output_for_tracking: true
|
108
|
+
use_multimask_token_for_obj_ptr: true
|
109
|
+
multimask_min_pt_num: 0
|
110
|
+
multimask_max_pt_num: 1
|
111
|
+
use_mlp_for_obj_ptr_proj: true
|
112
|
+
# Compilation flag
|
113
|
+
compile_image_encoder: False
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# @package _global_
|
2
|
+
|
3
|
+
# Model
|
4
|
+
model:
|
5
|
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
6
|
+
image_encoder:
|
7
|
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8
|
+
scalp: 1
|
9
|
+
trunk:
|
10
|
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11
|
+
embed_dim: 144
|
12
|
+
num_heads: 2
|
13
|
+
stages: [2, 6, 36, 4]
|
14
|
+
global_att_blocks: [23, 33, 43]
|
15
|
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
16
|
+
window_spec: [8, 4, 16, 8]
|
17
|
+
neck:
|
18
|
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
19
|
+
position_encoding:
|
20
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
21
|
+
num_pos_feats: 256
|
22
|
+
normalize: true
|
23
|
+
scale: null
|
24
|
+
temperature: 10000
|
25
|
+
d_model: 256
|
26
|
+
backbone_channel_list: [1152, 576, 288, 144]
|
27
|
+
fpn_top_down_levels: [2, 3] # output levels 0 and 1 directly use the backbone features
|
28
|
+
fpn_interp_model: nearest
|
29
|
+
|
30
|
+
memory_attention:
|
31
|
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
32
|
+
d_model: 256
|
33
|
+
pos_enc_at_input: true
|
34
|
+
layer:
|
35
|
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
36
|
+
activation: relu
|
37
|
+
dim_feedforward: 2048
|
38
|
+
dropout: 0.1
|
39
|
+
pos_enc_at_attn: false
|
40
|
+
self_attention:
|
41
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
42
|
+
rope_theta: 10000.0
|
43
|
+
feat_sizes: [32, 32]
|
44
|
+
embedding_dim: 256
|
45
|
+
num_heads: 1
|
46
|
+
downsample_rate: 1
|
47
|
+
dropout: 0.1
|
48
|
+
d_model: 256
|
49
|
+
pos_enc_at_cross_attn_keys: true
|
50
|
+
pos_enc_at_cross_attn_queries: false
|
51
|
+
cross_attention:
|
52
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
53
|
+
rope_theta: 10000.0
|
54
|
+
feat_sizes: [32, 32]
|
55
|
+
rope_k_repeat: True
|
56
|
+
embedding_dim: 256
|
57
|
+
num_heads: 1
|
58
|
+
downsample_rate: 1
|
59
|
+
dropout: 0.1
|
60
|
+
kv_in_dim: 64
|
61
|
+
num_layers: 4
|
62
|
+
|
63
|
+
memory_encoder:
|
64
|
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
65
|
+
out_dim: 64
|
66
|
+
position_encoding:
|
67
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
68
|
+
num_pos_feats: 64
|
69
|
+
normalize: true
|
70
|
+
scale: null
|
71
|
+
temperature: 10000
|
72
|
+
mask_downsampler:
|
73
|
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
74
|
+
kernel_size: 3
|
75
|
+
stride: 2
|
76
|
+
padding: 1
|
77
|
+
fuser:
|
78
|
+
_target_: sam2.modeling.memory_encoder.Fuser
|
79
|
+
layer:
|
80
|
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
81
|
+
dim: 256
|
82
|
+
kernel_size: 7
|
83
|
+
padding: 3
|
84
|
+
layer_scale_init_value: 1e-6
|
85
|
+
use_dwconv: True # depth-wise convs
|
86
|
+
num_layers: 2
|
87
|
+
|
88
|
+
num_maskmem: 7
|
89
|
+
image_size: 1024
|
90
|
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
91
|
+
sigmoid_scale_for_mem_enc: 20.0
|
92
|
+
sigmoid_bias_for_mem_enc: -10.0
|
93
|
+
use_mask_input_as_output_without_sam: true
|
94
|
+
# Memory
|
95
|
+
directly_add_no_mem_embed: true
|
96
|
+
# use high-resolution feature map in the SAM mask decoder
|
97
|
+
use_high_res_features_in_sam: true
|
98
|
+
# output 3 masks on the first click on initial conditioning frames
|
99
|
+
multimask_output_in_sam: true
|
100
|
+
# SAM heads
|
101
|
+
iou_prediction_use_sigmoid: True
|
102
|
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
103
|
+
use_obj_ptrs_in_encoder: true
|
104
|
+
add_tpos_enc_to_obj_ptrs: false
|
105
|
+
only_obj_ptrs_in_the_past_for_eval: true
|
106
|
+
# object occlusion prediction
|
107
|
+
pred_obj_scores: true
|
108
|
+
pred_obj_scores_mlp: true
|
109
|
+
fixed_no_obj_ptr: true
|
110
|
+
# multimask tracking settings
|
111
|
+
multimask_output_for_tracking: true
|
112
|
+
use_multimask_token_for_obj_ptr: true
|
113
|
+
multimask_min_pt_num: 0
|
114
|
+
multimask_max_pt_num: 1
|
115
|
+
use_mlp_for_obj_ptr_proj: true
|
116
|
+
# Compilation flag
|
117
|
+
compile_image_encoder: False
|
@@ -0,0 +1,116 @@
|
|
1
|
+
# @package _global_
|
2
|
+
|
3
|
+
# Model
|
4
|
+
model:
|
5
|
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
6
|
+
image_encoder:
|
7
|
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8
|
+
scalp: 1
|
9
|
+
trunk:
|
10
|
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11
|
+
embed_dim: 96
|
12
|
+
num_heads: 1
|
13
|
+
stages: [1, 2, 11, 2]
|
14
|
+
global_att_blocks: [7, 10, 13]
|
15
|
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
16
|
+
neck:
|
17
|
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
18
|
+
position_encoding:
|
19
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
20
|
+
num_pos_feats: 256
|
21
|
+
normalize: true
|
22
|
+
scale: null
|
23
|
+
temperature: 10000
|
24
|
+
d_model: 256
|
25
|
+
backbone_channel_list: [768, 384, 192, 96]
|
26
|
+
fpn_top_down_levels: [2, 3] # output levels 0 and 1 directly use the backbone features
|
27
|
+
fpn_interp_model: nearest
|
28
|
+
|
29
|
+
memory_attention:
|
30
|
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
31
|
+
d_model: 256
|
32
|
+
pos_enc_at_input: true
|
33
|
+
layer:
|
34
|
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
35
|
+
activation: relu
|
36
|
+
dim_feedforward: 2048
|
37
|
+
dropout: 0.1
|
38
|
+
pos_enc_at_attn: false
|
39
|
+
self_attention:
|
40
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
41
|
+
rope_theta: 10000.0
|
42
|
+
feat_sizes: [32, 32]
|
43
|
+
embedding_dim: 256
|
44
|
+
num_heads: 1
|
45
|
+
downsample_rate: 1
|
46
|
+
dropout: 0.1
|
47
|
+
d_model: 256
|
48
|
+
pos_enc_at_cross_attn_keys: true
|
49
|
+
pos_enc_at_cross_attn_queries: false
|
50
|
+
cross_attention:
|
51
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
52
|
+
rope_theta: 10000.0
|
53
|
+
feat_sizes: [32, 32]
|
54
|
+
rope_k_repeat: True
|
55
|
+
embedding_dim: 256
|
56
|
+
num_heads: 1
|
57
|
+
downsample_rate: 1
|
58
|
+
dropout: 0.1
|
59
|
+
kv_in_dim: 64
|
60
|
+
num_layers: 4
|
61
|
+
|
62
|
+
memory_encoder:
|
63
|
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
64
|
+
out_dim: 64
|
65
|
+
position_encoding:
|
66
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
67
|
+
num_pos_feats: 64
|
68
|
+
normalize: true
|
69
|
+
scale: null
|
70
|
+
temperature: 10000
|
71
|
+
mask_downsampler:
|
72
|
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
73
|
+
kernel_size: 3
|
74
|
+
stride: 2
|
75
|
+
padding: 1
|
76
|
+
fuser:
|
77
|
+
_target_: sam2.modeling.memory_encoder.Fuser
|
78
|
+
layer:
|
79
|
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
80
|
+
dim: 256
|
81
|
+
kernel_size: 7
|
82
|
+
padding: 3
|
83
|
+
layer_scale_init_value: 1e-6
|
84
|
+
use_dwconv: True # depth-wise convs
|
85
|
+
num_layers: 2
|
86
|
+
|
87
|
+
num_maskmem: 7
|
88
|
+
image_size: 1024
|
89
|
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
90
|
+
sigmoid_scale_for_mem_enc: 20.0
|
91
|
+
sigmoid_bias_for_mem_enc: -10.0
|
92
|
+
use_mask_input_as_output_without_sam: true
|
93
|
+
# Memory
|
94
|
+
directly_add_no_mem_embed: true
|
95
|
+
# use high-resolution feature map in the SAM mask decoder
|
96
|
+
use_high_res_features_in_sam: true
|
97
|
+
# output 3 masks on the first click on initial conditioning frames
|
98
|
+
multimask_output_in_sam: true
|
99
|
+
# SAM heads
|
100
|
+
iou_prediction_use_sigmoid: True
|
101
|
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
102
|
+
use_obj_ptrs_in_encoder: true
|
103
|
+
add_tpos_enc_to_obj_ptrs: false
|
104
|
+
only_obj_ptrs_in_the_past_for_eval: true
|
105
|
+
# object occlusion prediction
|
106
|
+
pred_obj_scores: true
|
107
|
+
pred_obj_scores_mlp: true
|
108
|
+
fixed_no_obj_ptr: true
|
109
|
+
# multimask tracking settings
|
110
|
+
multimask_output_for_tracking: true
|
111
|
+
use_multimask_token_for_obj_ptr: true
|
112
|
+
multimask_min_pt_num: 0
|
113
|
+
multimask_max_pt_num: 1
|
114
|
+
use_mlp_for_obj_ptr_proj: true
|
115
|
+
# Compilation flag
|
116
|
+
compile_image_encoder: False
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# @package _global_
|
2
|
+
|
3
|
+
# Model
|
4
|
+
model:
|
5
|
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
6
|
+
image_encoder:
|
7
|
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
8
|
+
scalp: 1
|
9
|
+
trunk:
|
10
|
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
11
|
+
embed_dim: 96
|
12
|
+
num_heads: 1
|
13
|
+
stages: [1, 2, 7, 2]
|
14
|
+
global_att_blocks: [5, 7, 9]
|
15
|
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
16
|
+
neck:
|
17
|
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
18
|
+
position_encoding:
|
19
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
20
|
+
num_pos_feats: 256
|
21
|
+
normalize: true
|
22
|
+
scale: null
|
23
|
+
temperature: 10000
|
24
|
+
d_model: 256
|
25
|
+
backbone_channel_list: [768, 384, 192, 96]
|
26
|
+
fpn_top_down_levels: [2, 3] # output levels 0 and 1 directly use the backbone features
|
27
|
+
fpn_interp_model: nearest
|
28
|
+
|
29
|
+
memory_attention:
|
30
|
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
31
|
+
d_model: 256
|
32
|
+
pos_enc_at_input: true
|
33
|
+
layer:
|
34
|
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
35
|
+
activation: relu
|
36
|
+
dim_feedforward: 2048
|
37
|
+
dropout: 0.1
|
38
|
+
pos_enc_at_attn: false
|
39
|
+
self_attention:
|
40
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
41
|
+
rope_theta: 10000.0
|
42
|
+
feat_sizes: [32, 32]
|
43
|
+
embedding_dim: 256
|
44
|
+
num_heads: 1
|
45
|
+
downsample_rate: 1
|
46
|
+
dropout: 0.1
|
47
|
+
d_model: 256
|
48
|
+
pos_enc_at_cross_attn_keys: true
|
49
|
+
pos_enc_at_cross_attn_queries: false
|
50
|
+
cross_attention:
|
51
|
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
52
|
+
rope_theta: 10000.0
|
53
|
+
feat_sizes: [32, 32]
|
54
|
+
rope_k_repeat: True
|
55
|
+
embedding_dim: 256
|
56
|
+
num_heads: 1
|
57
|
+
downsample_rate: 1
|
58
|
+
dropout: 0.1
|
59
|
+
kv_in_dim: 64
|
60
|
+
num_layers: 4
|
61
|
+
|
62
|
+
memory_encoder:
|
63
|
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
64
|
+
out_dim: 64
|
65
|
+
position_encoding:
|
66
|
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
67
|
+
num_pos_feats: 64
|
68
|
+
normalize: true
|
69
|
+
scale: null
|
70
|
+
temperature: 10000
|
71
|
+
mask_downsampler:
|
72
|
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
73
|
+
kernel_size: 3
|
74
|
+
stride: 2
|
75
|
+
padding: 1
|
76
|
+
fuser:
|
77
|
+
_target_: sam2.modeling.memory_encoder.Fuser
|
78
|
+
layer:
|
79
|
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
80
|
+
dim: 256
|
81
|
+
kernel_size: 7
|
82
|
+
padding: 3
|
83
|
+
layer_scale_init_value: 1e-6
|
84
|
+
use_dwconv: True # depth-wise convs
|
85
|
+
num_layers: 2
|
86
|
+
|
87
|
+
num_maskmem: 7
|
88
|
+
image_size: 1024
|
89
|
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
90
|
+
# SAM decoder
|
91
|
+
sigmoid_scale_for_mem_enc: 20.0
|
92
|
+
sigmoid_bias_for_mem_enc: -10.0
|
93
|
+
use_mask_input_as_output_without_sam: true
|
94
|
+
# Memory
|
95
|
+
directly_add_no_mem_embed: true
|
96
|
+
# use high-resolution feature map in the SAM mask decoder
|
97
|
+
use_high_res_features_in_sam: true
|
98
|
+
# output 3 masks on the first click on initial conditioning frames
|
99
|
+
multimask_output_in_sam: true
|
100
|
+
# SAM heads
|
101
|
+
iou_prediction_use_sigmoid: True
|
102
|
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
103
|
+
use_obj_ptrs_in_encoder: true
|
104
|
+
add_tpos_enc_to_obj_ptrs: false
|
105
|
+
only_obj_ptrs_in_the_past_for_eval: true
|
106
|
+
# object occlusion prediction
|
107
|
+
pred_obj_scores: true
|
108
|
+
pred_obj_scores_mlp: true
|
109
|
+
fixed_no_obj_ptr: true
|
110
|
+
# multimask tracking settings
|
111
|
+
multimask_output_for_tracking: true
|
112
|
+
use_multimask_token_for_obj_ptr: true
|
113
|
+
multimask_min_pt_num: 0
|
114
|
+
multimask_max_pt_num: 1
|
115
|
+
use_mlp_for_obj_ptr_proj: true
|
116
|
+
# Compilation flag
|
117
|
+
# HieraT does not currently support compilation; this flag should always be set to False
|
118
|
+
compile_image_encoder: False
|