tf-models-nightly 2.18.0.dev20240911__py2.py3-none-any.whl → 2.18.0.dev20240912__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
official/vision/configs/backbones.py

@@ -52,6 +52,8 @@ class VisionTransformer(hyperparams.Config):
   layer_scale_init_value: float = 0.0
   # Transformer encoder spatial partition dimensions.
   transformer_partition_dims: Optional[Tuple[int, int, int, int]] = None
+  # If True, output attention scores.
+  output_attention_scores: bool = False
 
 
 @dataclasses.dataclass
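Since `VisionTransformer(hyperparams.Config)` is a dataclass-style config, the new field can be set like any other. A minimal sketch, assuming the standard import path for this file (official.vision.configs.backbones):

    from official.vision.configs import backbones

    # The flag defaults to False, so existing configs are unaffected.
    vit_cfg = backbones.VisionTransformer(output_attention_scores=True)
    assert vit_cfg.output_attention_scores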
official/vision/modeling/backbones/vit.py

@@ -108,22 +108,25 @@ class TokenLayer(layers.Layer):
 class Encoder(layers.Layer):
   """Transformer Encoder."""
 
-  def __init__(self,
-               num_layers,
-               mlp_dim,
-               num_heads,
-               dropout_rate=0.1,
-               attention_dropout_rate=0.1,
-               kernel_regularizer=None,
-               inputs_positions=None,
-               init_stochastic_depth_rate=0.0,
-               kernel_initializer='glorot_uniform',
-               add_pos_embed=True,
-               pos_embed_origin_shape=None,
-               pos_embed_target_shape=None,
-               layer_scale_init_value=0.0,
-               transformer_partition_dims=None,
-               **kwargs):
+  def __init__(
+      self,
+      num_layers,
+      mlp_dim,
+      num_heads,
+      dropout_rate=0.1,
+      attention_dropout_rate=0.1,
+      kernel_regularizer=None,
+      inputs_positions=None,
+      init_stochastic_depth_rate=0.0,
+      kernel_initializer='glorot_uniform',
+      add_pos_embed=True,
+      pos_embed_origin_shape=None,
+      pos_embed_target_shape=None,
+      layer_scale_init_value=0.0,
+      transformer_partition_dims=None,
+      output_attention_scores=False,
+      **kwargs,
+  ):
     super().__init__(**kwargs)
     self._num_layers = num_layers
     self._mlp_dim = mlp_dim
@@ -139,6 +142,7 @@ class Encoder(layers.Layer):
     self._pos_embed_target_shape = pos_embed_target_shape
     self._layer_scale_init_value = layer_scale_init_value
     self._transformer_partition_dims = transformer_partition_dims
+    self._output_attention_scores = output_attention_scores
 
   def build(self, input_shape):
     if self._add_pos_embed:
@@ -163,10 +167,13 @@ class Encoder(layers.Layer):
           kernel_initializer=self._kernel_initializer,
           norm_first=True,
           stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate(
-              self._init_stochastic_depth_rate, i + 1, self._num_layers),
+              self._init_stochastic_depth_rate, i + 1, self._num_layers
+          ),
           norm_epsilon=1e-6,
           layer_scale_init_value=self._layer_scale_init_value,
-          transformer_partition_dims=self._transformer_partition_dims)
+          transformer_partition_dims=self._transformer_partition_dims,
+          return_attention_scores=self._output_attention_scores,
+      )
       self._encoder_layers.append(encoder_layer)
     self._norm = layers.LayerNormalization(epsilon=1e-6)
     super().build(input_shape)
@@ -177,9 +184,16 @@ class Encoder(layers.Layer):
       x = self._pos_embed(x, inputs_positions=self._inputs_positions)
     x = self._dropout(x, training=training)
 
+    attention_scores = None  # Needed to suppress undefined-variable warning.
     for encoder_layer in self._encoder_layers:
-      x = encoder_layer(x, training=training)
+      if self._output_attention_scores:
+        x, attention_scores = encoder_layer(x, training=training)
+      else:
+        x = encoder_layer(x, training=training)
     x = self._norm(x)
+
+    if self._output_attention_scores:
+      return x, attention_scores
     return x
 
   def get_config(self):
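Note that `attention_scores` is reassigned on every loop iteration, so with the flag set the Encoder returns the scores of the final transformer layer only, as a `(features, scores)` tuple. A minimal standalone sketch of the new return contract (ViT-B shapes assumed: 197 tokens, 768 channels, 12 heads):

    import tensorflow as tf
    from official.vision.modeling.backbones import vit

    encoder = vit.Encoder(
        num_layers=12, mlp_dim=3072, num_heads=12,
        output_attention_scores=True)
    tokens = tf.zeros([1, 197, 768])
    # With the flag set, call() returns a (features, scores) tuple;
    # scores come from the last encoder layer only.
    x, scores = encoder(tokens, training=False)  # scores: [1, 12, 197, 197]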
@@ -199,6 +213,7 @@ class Encoder(layers.Layer):
         'pos_embed_target_shape': self._pos_embed_target_shape,
         'layer_scale_init_value': self._layer_scale_init_value,
         'transformer_partition_dims': self._transformer_partition_dims,
+        'output_attention_scores': self._output_attention_scores,
     }
     config.update(updates)
     return config
@@ -227,6 +242,7 @@ class VisionTransformer(tf_keras.Model):
       pos_embed_shape: Optional[Tuple[int, int]] = None,
       layer_scale_init_value: float = 0.0,
       transformer_partition_dims: Optional[Tuple[int, int, int, int]] = None,
+      output_attention_scores: bool = False,
   ):
     """VisionTransformer initialization function."""
     self._mlp_dim = mlp_dim
@@ -265,20 +281,29 @@ class VisionTransformer(tf_keras.Model):
     if pooler == 'token':
       x = TokenLayer(name='cls')(x)
 
-    x = Encoder(
+    encoder_output = Encoder(
         num_layers=num_layers,
         mlp_dim=mlp_dim,
         num_heads=num_heads,
         dropout_rate=dropout_rate,
         attention_dropout_rate=attention_dropout_rate,
         kernel_regularizer=kernel_regularizer,
-        kernel_initializer='glorot_uniform' if original_init else dict(
-            class_name='TruncatedNormal', config=dict(stddev=.02)),
+        kernel_initializer='glorot_uniform'
+        if original_init
+        else dict(class_name='TruncatedNormal', config=dict(stddev=0.02)),
         init_stochastic_depth_rate=init_stochastic_depth_rate,
        pos_embed_origin_shape=pos_embed_shape,
         pos_embed_target_shape=pos_embed_target_shape,
-        layer_scale_init_value=layer_scale_init_value)(
-            x)
+        layer_scale_init_value=layer_scale_init_value,
+        output_attention_scores=output_attention_scores,
+    )(x)
+
+    endpoints = {}
+    if output_attention_scores:
+      x, attention_scores = encoder_output
+      endpoints['attention_scores'] = attention_scores
+    else:
+      x = encoder_output
 
     if pooler == 'token':
       output_feature = x[:, 1:]
@@ -292,7 +317,6 @@ class VisionTransformer(tf_keras.Model):
     else:
       raise ValueError(f'unrecognized pooler type: {pooler}')
 
-    endpoints = {}
     if output_2d_feature_maps:
       # Use the closest feature level.
       feat_level = round(math.log2(patch_size))
@@ -376,4 +400,6 @@ def build_vit(input_specs,
       output_2d_feature_maps=backbone_cfg.output_2d_feature_maps,
       layer_scale_init_value=backbone_cfg.layer_scale_init_value,
       pos_embed_shape=backbone_cfg.pos_embed_shape,
-      transformer_partition_dims=backbone_cfg.transformer_partition_dims)
+      transformer_partition_dims=backbone_cfg.transformer_partition_dims,
+      output_attention_scores=backbone_cfg.output_attention_scores,
+  )
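Putting the plumbing together, a hedged end-to-end sketch of building the backbone from config via this factory. The parameter names beyond `input_specs` (`backbone_config`, `norm_activation_config`) and the `Backbone`/`NormActivation` config classes are assumed from the usual official.vision builder conventions, not shown in this diff:

    import tf_keras
    from official.vision.configs import backbones, common
    from official.vision.modeling.backbones import vit

    backbone = vit.build_vit(
        input_specs=tf_keras.layers.InputSpec(shape=[None, 224, 224, 3]),
        backbone_config=backbones.Backbone(
            type='vit',
            vit=backbones.VisionTransformer(output_attention_scores=True)),
        norm_activation_config=common.NormActivation(),
    )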
official/vision/modeling/backbones/vit_test.py

@@ -95,6 +95,17 @@ class VisionTransformerTest(parameterized.TestCase, tf.test.TestCase):
     output = network(inputs)['pre_logits']
     self.assertEqual(output.shape, [1, 1, 1, 768])
 
+  def test_attention_scores(self):
+    tf_keras.backend.set_image_data_format('channels_last')
+    input_specs = tf_keras.layers.InputSpec(shape=[2, 224, 224, 3])
+    network = vit.VisionTransformer(
+        input_specs=input_specs, output_attention_scores=True
+    )
+
+    inputs = tf_keras.Input(shape=(224, 224, 3), batch_size=1)
+    outputs = network(inputs)
+    self.assertEqual(outputs['attention_scores'].shape, [1, 12, 197, 197])
+
 
 if __name__ == '__main__':
   tf.test.main()
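The expected shape in the new test follows from the ViT-B/16 defaults: a 224×224 input yields (224/16)² = 196 patch tokens plus one class token, i.e. 197 tokens, and the backbone has 12 attention heads, hence [batch, heads, tokens, tokens] = [1, 12, 197, 197]; there is no layer dimension because only the last encoder layer's scores are returned. A consumer-side sketch mirroring the test (the slice at index 0 assumes the CLS token is prepended, as TokenLayer does):

    import tensorflow as tf
    import tf_keras
    from official.vision.modeling.backbones import vit

    network = vit.VisionTransformer(
        input_specs=tf_keras.layers.InputSpec(shape=[None, 224, 224, 3]),
        output_attention_scores=True,
    )
    outputs = network(tf.zeros([1, 224, 224, 3]))
    scores = outputs['attention_scores']  # [1, 12, 197, 197], last layer only.
    # E.g. attention from the CLS token (index 0) to the 196 patch tokens:
    cls_to_patches = scores[:, :, 0, 1:]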
tf_models_nightly-2.18.0.dev20240911.dist-info/METADATA → tf_models_nightly-2.18.0.dev20240912.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tf-models-nightly
-Version: 2.18.0.dev20240911
+Version: 2.18.0.dev20240912
 Summary: TensorFlow Official Models
 Home-page: https://github.com/tensorflow/models
 Author: Google Inc.
tf_models_nightly-2.18.0.dev20240911.dist-info/RECORD → tf_models_nightly-2.18.0.dev20240912.dist-info/RECORD

@@ -971,7 +971,7 @@ official/vision/registry_imports.py,sha256=__tuPecJUjyfXgiFst8ZJJT5OljeleDFT7c5i
 official/vision/train.py,sha256=8h7lbaC6WCgF7XLvr2tG3-hLhbtBaoP3a6bAt3E8QeU,4005
 official/vision/train_spatial_partitioning.py,sha256=xpEusyM-fEPVGyBuzlkRgsYGaPAobjypSjpapdj0-ec,5735
 official/vision/configs/__init__.py,sha256=-iKVbGCvFMGSHMC89utzGXvZ83BhW6JnbEfS38lbW3M,1045
-official/vision/configs/backbones.py,sha256=hlMwjUbpoAk_MPezXFcaAkDTBqyu7QGWTfrkeIAIG-E,5823
+official/vision/configs/backbones.py,sha256=BNvcKNqX1wmYT-OzZ_34rBLeFRK579kiWYh9PGV-aYw,5901
 official/vision/configs/backbones_3d.py,sha256=0lJsUzeYmuC5xiosOwrqlmgR1gkOa4tpSaxDbYYU7FE,3614
 official/vision/configs/common.py,sha256=arlsSF6_Q7Ng8WxqVpPqPHgAwCnGdskHs-DAQmqzu5I,6566
 official/vision/configs/decoders.py,sha256=_wG6MH1RzYuhMrvJu5menR7gDvklEXDSxlHJwzVF6H4,2080
@@ -1079,9 +1079,9 @@ official/vision/modeling/backbones/spinenet.py,sha256=FOCafyw_ZVIY76gzpiY8Al4mXr
 official/vision/modeling/backbones/spinenet_mobile.py,sha256=x2gcLs-caun6M9SYCgG9sKAm9YE3g6MfdT5YyvsBboA,20784
 official/vision/modeling/backbones/spinenet_mobile_test.py,sha256=Me6bQCfu1T3gliPsRj-MuMF4hREZb7SRPpPDJS_9LzY,3958
 official/vision/modeling/backbones/spinenet_test.py,sha256=Xf2N2IiN-x_cnL5_p22LPy06FjqG6XpCuVCySwcZUDE,4734
-official/vision/modeling/backbones/vit.py,sha256=-ErjrdAVH_lRZO320LjcP4PE4zy_vGJdlUqTYZI9Tvc,14438
+official/vision/modeling/backbones/vit.py,sha256=-qztjnzym-UO37nJCuSjlqCzkb9VPrXS2ZXs6CcbHto,15181
 official/vision/modeling/backbones/vit_specs.py,sha256=fJv6xLkpViz8W-ovBH6j76AWb09X2P2-6vDAkdH9Ezs,2412
-official/vision/modeling/backbones/vit_test.py,sha256=JzhzEaOfq1oQhTcfZF40S3iId1J8HsQvphoznAzo-MY,3463
+official/vision/modeling/backbones/vit_test.py,sha256=K4y-BPqCCcdqPZUSUuFazQwG_UPVJGn-XWI0FGtazhQ,3901
 official/vision/modeling/decoders/__init__.py,sha256=9u1sY2dRGvnCY678wBo4SjKOI8ywVYn_JvtwohhRlYM,815
 official/vision/modeling/decoders/aspp.py,sha256=tbHitR7pUvA5wiXsVpK904GjOOZLtXuJLq_FYlPLwTE,8610
 official/vision/modeling/decoders/aspp_test.py,sha256=w99TQPM2jaOo18XdUcZz_iWt6hpC_2WDAVq-cdbFWR8,3011
@@ -1222,9 +1222,9 @@ tensorflow_models/tensorflow_models_test.py,sha256=nc6A9K53OGqF25xN5St8EiWvdVbda
 tensorflow_models/nlp/__init__.py,sha256=4tA5Pf4qaFwT-fIFOpX7x7FHJpnyJT-5UgOeFYTyMlc,807
 tensorflow_models/uplift/__init__.py,sha256=mqfa55gweOdpKoaQyid4A_4u7xw__FcQeSIF0k_pYmI,999
 tensorflow_models/vision/__init__.py,sha256=zBorY_v5xva1uI-qxhZO3Qh-Dii-Suq6wEYh6hKHDfc,833
-tf_models_nightly-2.18.0.dev20240911.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
-tf_models_nightly-2.18.0.dev20240911.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
-tf_models_nightly-2.18.0.dev20240911.dist-info/METADATA,sha256=nqTkSETn7Mnd-KUOpzZerGxqtI6pMreziyNvlcP-QKg,1432
-tf_models_nightly-2.18.0.dev20240911.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
-tf_models_nightly-2.18.0.dev20240911.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
-tf_models_nightly-2.18.0.dev20240911.dist-info/RECORD,,
+tf_models_nightly-2.18.0.dev20240912.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.18.0.dev20240912.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.18.0.dev20240912.dist-info/METADATA,sha256=DSdv3ZNz6oi2xj_9C5HPYHoh0h1dxv8tz_LER44-4Ms,1432
+tf_models_nightly-2.18.0.dev20240912.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.18.0.dev20240912.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.18.0.dev20240912.dist-info/RECORD,,