tf-models-nightly 2.18.0.dev20240911__py2.py3-none-any.whl → 2.18.0.dev20240912__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- official/vision/configs/backbones.py +2 -0
- official/vision/modeling/backbones/vit.py +52 -26
- official/vision/modeling/backbones/vit_test.py +11 -0
- {tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/METADATA +1 -1
- {tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/RECORD +9 -9
- {tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/AUTHORS +0 -0
- {tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/LICENSE +0 -0
- {tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/WHEEL +0 -0
- {tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/top_level.txt +0 -0
official/vision/configs/backbones.py

@@ -52,6 +52,8 @@ class VisionTransformer(hyperparams.Config):
   layer_scale_init_value: float = 0.0
   # Transformer encoder spatial partition dimensions.
   transformer_partition_dims: Optional[Tuple[int, int, int, int]] = None
+  # If True, output attention scores.
+  output_attention_scores: bool = False


 @dataclasses.dataclass
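The new flag is an ordinary config field, so it can be switched on from a backbone config without touching model code. A minimal sketch of enabling it (hypothetical usage, not part of this diff; it assumes the Backbone oneof config defined in the same module):

    from official.vision.configs import backbones

    # Hypothetical: select the ViT backbone and ask it to expose attention scores.
    backbone_cfg = backbones.Backbone(
        type='vit',
        vit=backbones.VisionTransformer(output_attention_scores=True),
    )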
official/vision/modeling/backbones/vit.py

@@ -108,22 +108,25 @@ class TokenLayer(layers.Layer):
 class Encoder(layers.Layer):
   """Transformer Encoder."""

-  def __init__(
-      [old signature lines 112-126 omitted]
+  def __init__(
+      self,
+      num_layers,
+      mlp_dim,
+      num_heads,
+      dropout_rate=0.1,
+      attention_dropout_rate=0.1,
+      kernel_regularizer=None,
+      inputs_positions=None,
+      init_stochastic_depth_rate=0.0,
+      kernel_initializer='glorot_uniform',
+      add_pos_embed=True,
+      pos_embed_origin_shape=None,
+      pos_embed_target_shape=None,
+      layer_scale_init_value=0.0,
+      transformer_partition_dims=None,
+      output_attention_scores=False,
+      **kwargs,
+  ):
     super().__init__(**kwargs)
     self._num_layers = num_layers
     self._mlp_dim = mlp_dim
@@ -139,6 +142,7 @@ class Encoder(layers.Layer):
     self._pos_embed_target_shape = pos_embed_target_shape
     self._layer_scale_init_value = layer_scale_init_value
     self._transformer_partition_dims = transformer_partition_dims
+    self._output_attention_scores = output_attention_scores

   def build(self, input_shape):
     if self._add_pos_embed:
@@ -163,10 +167,13 @@
           kernel_initializer=self._kernel_initializer,
           norm_first=True,
           stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate(
-              self._init_stochastic_depth_rate, i + 1, self._num_layers),
+              self._init_stochastic_depth_rate, i + 1, self._num_layers
+          ),
           norm_epsilon=1e-6,
           layer_scale_init_value=self._layer_scale_init_value,
-          transformer_partition_dims=self._transformer_partition_dims)
+          transformer_partition_dims=self._transformer_partition_dims,
+          return_attention_scores=self._output_attention_scores,
+      )
       self._encoder_layers.append(encoder_layer)
     self._norm = layers.LayerNormalization(epsilon=1e-6)
     super().build(input_shape)
@@ -177,9 +184,16 @@
       x = self._pos_embed(x, inputs_positions=self._inputs_positions)
     x = self._dropout(x, training=training)

+    attention_scores = None  # Needed to suppress undefined-variable warning.
     for encoder_layer in self._encoder_layers:
-      x = encoder_layer(x, training=training)
+      if self._output_attention_scores:
+        x, attention_scores = encoder_layer(x, training=training)
+      else:
+        x = encoder_layer(x, training=training)
     x = self._norm(x)
+
+    if self._output_attention_scores:
+      return x, attention_scores
     return x

   def get_config(self):
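Note that attention_scores is reassigned on every pass through the loop above, so when the flag is set the returned pair carries the scores of the final encoder block only. A small standalone sketch of the new return contract (hypothetical toy sizes, not taken from this diff):

    import tensorflow as tf

    from official.vision.modeling.backbones import vit

    # Toy-sized Encoder with the new flag; tokens are [batch, sequence, hidden].
    encoder = vit.Encoder(
        num_layers=2, mlp_dim=128, num_heads=4, output_attention_scores=True)
    tokens = tf.zeros([1, 197, 64])
    features, scores = encoder(tokens, training=False)
    # scores come from the last of the two blocks: [1, num_heads, 197, 197].
    print(features.shape, scores.shape)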
@@ -199,6 +213,7 @@ class Encoder(layers.Layer):
         'pos_embed_target_shape': self._pos_embed_target_shape,
         'layer_scale_init_value': self._layer_scale_init_value,
         'transformer_partition_dims': self._transformer_partition_dims,
+        'output_attention_scores': self._output_attention_scores,
     }
     config.update(updates)
     return config
@@ -227,6 +242,7 @@ class VisionTransformer(tf_keras.Model):
       pos_embed_shape: Optional[Tuple[int, int]] = None,
       layer_scale_init_value: float = 0.0,
       transformer_partition_dims: Optional[Tuple[int, int, int, int]] = None,
+      output_attention_scores: bool = False,
   ):
     """VisionTransformer initialization function."""
     self._mlp_dim = mlp_dim
@@ -265,20 +281,29 @@ class VisionTransformer(tf_keras.Model):
     if pooler == 'token':
       x = TokenLayer(name='cls')(x)

-    x = Encoder(
+    encoder_output = Encoder(
         num_layers=num_layers,
         mlp_dim=mlp_dim,
         num_heads=num_heads,
         dropout_rate=dropout_rate,
         attention_dropout_rate=attention_dropout_rate,
         kernel_regularizer=kernel_regularizer,
-        kernel_initializer='glorot_uniform'
-        if original_init else dict(class_name='TruncatedNormal', config=dict(stddev=0.02)),
+        kernel_initializer='glorot_uniform'
+        if original_init
+        else dict(class_name='TruncatedNormal', config=dict(stddev=0.02)),
         init_stochastic_depth_rate=init_stochastic_depth_rate,
         pos_embed_origin_shape=pos_embed_shape,
         pos_embed_target_shape=pos_embed_target_shape,
-        layer_scale_init_value=layer_scale_init_value
-        [old line 281 not captured]
+        layer_scale_init_value=layer_scale_init_value,
+        output_attention_scores=output_attention_scores,
+    )(x)
+
+    endpoints = {}
+    if output_attention_scores:
+      x, attention_scores = encoder_output
+      endpoints['attention_scores'] = attention_scores
+    else:
+      x = encoder_output

     if pooler == 'token':
       output_feature = x[:, 1:]
@@ -292,7 +317,6 @@ class VisionTransformer(tf_keras.Model):
     else:
       raise ValueError(f'unrecognized pooler type: {pooler}')

-    endpoints = {}
     if output_2d_feature_maps:
       # Use the closest feature level.
       feat_level = round(math.log2(patch_size))
@@ -376,4 +400,6 @@ def build_vit(input_specs,
       output_2d_feature_maps=backbone_cfg.output_2d_feature_maps,
       layer_scale_init_value=backbone_cfg.layer_scale_init_value,
       pos_embed_shape=backbone_cfg.pos_embed_shape,
-      transformer_partition_dims=backbone_cfg.transformer_partition_dims)
+      transformer_partition_dims=backbone_cfg.transformer_partition_dims,
+      output_attention_scores=backbone_cfg.output_attention_scores,
+  )
official/vision/modeling/backbones/vit_test.py

@@ -95,6 +95,17 @@ class VisionTransformerTest(parameterized.TestCase, tf.test.TestCase):
     output = network(inputs)['pre_logits']
     self.assertEqual(output.shape, [1, 1, 1, 768])

+  def test_attention_scores(self):
+    tf_keras.backend.set_image_data_format('channels_last')
+    input_specs = tf_keras.layers.InputSpec(shape=[2, 224, 224, 3])
+    network = vit.VisionTransformer(
+        input_specs=input_specs, output_attention_scores=True
+    )
+
+    inputs = tf_keras.Input(shape=(224, 224, 3), batch_size=1)
+    outputs = network(inputs)
+    self.assertEqual(outputs['attention_scores'].shape, [1, 12, 197, 197])
+

 if __name__ == '__main__':
   tf.test.main()
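Outside the test, the same endpoint can be read from the backbone's output dictionary. A minimal usage sketch (hypothetical, assuming the ViT-B/16 defaults that the test above also relies on):

    import tensorflow as tf
    import tf_keras

    from official.vision.modeling.backbones import vit

    # Build the backbone with the new flag; the defaults correspond to ViT-B/16.
    backbone = vit.VisionTransformer(
        input_specs=tf_keras.layers.InputSpec(shape=[None, 224, 224, 3]),
        output_attention_scores=True,
    )

    images = tf.zeros([1, 224, 224, 3])
    outputs = backbone(images)
    # 'attention_scores' sits alongside the usual endpoints such as 'pre_logits':
    # shape [1, 12, 197, 197] = [batch, heads, tokens, tokens] for ViT-B/16.
    attention = outputs['attention_scores']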
{tf_models_nightly-2.18.0.dev20240911.dist-info → tf_models_nightly-2.18.0.dev20240912.dist-info}/RECORD

@@ -971,7 +971,7 @@ official/vision/registry_imports.py,sha256=__tuPecJUjyfXgiFst8ZJJT5OljeleDFT7c5i
 official/vision/train.py,sha256=8h7lbaC6WCgF7XLvr2tG3-hLhbtBaoP3a6bAt3E8QeU,4005
 official/vision/train_spatial_partitioning.py,sha256=xpEusyM-fEPVGyBuzlkRgsYGaPAobjypSjpapdj0-ec,5735
 official/vision/configs/__init__.py,sha256=-iKVbGCvFMGSHMC89utzGXvZ83BhW6JnbEfS38lbW3M,1045
-official/vision/configs/backbones.py,sha256=
+official/vision/configs/backbones.py,sha256=BNvcKNqX1wmYT-OzZ_34rBLeFRK579kiWYh9PGV-aYw,5901
 official/vision/configs/backbones_3d.py,sha256=0lJsUzeYmuC5xiosOwrqlmgR1gkOa4tpSaxDbYYU7FE,3614
 official/vision/configs/common.py,sha256=arlsSF6_Q7Ng8WxqVpPqPHgAwCnGdskHs-DAQmqzu5I,6566
 official/vision/configs/decoders.py,sha256=_wG6MH1RzYuhMrvJu5menR7gDvklEXDSxlHJwzVF6H4,2080
@@ -1079,9 +1079,9 @@ official/vision/modeling/backbones/spinenet.py,sha256=FOCafyw_ZVIY76gzpiY8Al4mXr
 official/vision/modeling/backbones/spinenet_mobile.py,sha256=x2gcLs-caun6M9SYCgG9sKAm9YE3g6MfdT5YyvsBboA,20784
 official/vision/modeling/backbones/spinenet_mobile_test.py,sha256=Me6bQCfu1T3gliPsRj-MuMF4hREZb7SRPpPDJS_9LzY,3958
 official/vision/modeling/backbones/spinenet_test.py,sha256=Xf2N2IiN-x_cnL5_p22LPy06FjqG6XpCuVCySwcZUDE,4734
-official/vision/modeling/backbones/vit.py,sha256=-
+official/vision/modeling/backbones/vit.py,sha256=-qztjnzym-UO37nJCuSjlqCzkb9VPrXS2ZXs6CcbHto,15181
 official/vision/modeling/backbones/vit_specs.py,sha256=fJv6xLkpViz8W-ovBH6j76AWb09X2P2-6vDAkdH9Ezs,2412
-official/vision/modeling/backbones/vit_test.py,sha256=
+official/vision/modeling/backbones/vit_test.py,sha256=K4y-BPqCCcdqPZUSUuFazQwG_UPVJGn-XWI0FGtazhQ,3901
 official/vision/modeling/decoders/__init__.py,sha256=9u1sY2dRGvnCY678wBo4SjKOI8ywVYn_JvtwohhRlYM,815
 official/vision/modeling/decoders/aspp.py,sha256=tbHitR7pUvA5wiXsVpK904GjOOZLtXuJLq_FYlPLwTE,8610
 official/vision/modeling/decoders/aspp_test.py,sha256=w99TQPM2jaOo18XdUcZz_iWt6hpC_2WDAVq-cdbFWR8,3011
@@ -1222,9 +1222,9 @@ tensorflow_models/tensorflow_models_test.py,sha256=nc6A9K53OGqF25xN5St8EiWvdVbda
 tensorflow_models/nlp/__init__.py,sha256=4tA5Pf4qaFwT-fIFOpX7x7FHJpnyJT-5UgOeFYTyMlc,807
 tensorflow_models/uplift/__init__.py,sha256=mqfa55gweOdpKoaQyid4A_4u7xw__FcQeSIF0k_pYmI,999
 tensorflow_models/vision/__init__.py,sha256=zBorY_v5xva1uI-qxhZO3Qh-Dii-Suq6wEYh6hKHDfc,833
-tf_models_nightly-2.18.0.
-tf_models_nightly-2.18.0.
-tf_models_nightly-2.18.0.
-tf_models_nightly-2.18.0.
-tf_models_nightly-2.18.0.
-tf_models_nightly-2.18.0.
+tf_models_nightly-2.18.0.dev20240912.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.18.0.dev20240912.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.18.0.dev20240912.dist-info/METADATA,sha256=DSdv3ZNz6oi2xj_9C5HPYHoh0h1dxv8tz_LER44-4Ms,1432
+tf_models_nightly-2.18.0.dev20240912.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.18.0.dev20240912.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.18.0.dev20240912.dist-info/RECORD,,