keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- keras_hub/layers/__init__.py +6 -0
- keras_hub/models/__init__.py +21 -0
- keras_hub/src/layers/modeling/position_embedding.py +21 -6
- keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
- keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
- keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
- keras_hub/src/models/backbone.py +10 -15
- keras_hub/src/models/d_fine/__init__.py +0 -0
- keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
- keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
- keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
- keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
- keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
- keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
- keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
- keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
- keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
- keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
- keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
- keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
- keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
- keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
- keras_hub/src/models/parseq/__init__.py +0 -0
- keras_hub/src/models/parseq/parseq_backbone.py +134 -0
- keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
- keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
- keras_hub/src/models/parseq/parseq_decoder.py +418 -0
- keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
- keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
- keras_hub/src/tests/test_case.py +37 -1
- keras_hub/src/utils/preset_utils.py +49 -0
- keras_hub/src/utils/tensor_utils.py +23 -1
- keras_hub/src/utils/transformers/convert_vit.py +4 -1
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py
@@ -0,0 +1,642 @@
+import keras
+
+from keras_hub.src.models.d_fine.d_fine_encoder import DFineEncoder
+from keras_hub.src.models.d_fine.d_fine_layers import DFineConvNormLayer
+from keras_hub.src.models.d_fine.d_fine_layers import (
+    DFineFeatureAggregationBlock,
+)
+from keras_hub.src.models.d_fine.d_fine_layers import DFineSCDown
+
+
+class DFineHybridEncoder(keras.layers.Layer):
+    """Hybrid encoder for the D-FINE model.
+
+    This layer sits between the HGNetV2 backbone (`HGNetV2Backbone`) and the
+    main `DFineDecoder`. It takes multi-scale feature maps from the backbone,
+    optionally refines them with transformer-based `DFineEncoder` layers, and
+    then fuses them using a Feature Pyramid Network (FPN) top-down pathway and a
+    Path Aggregation Network (PAN) bottom-up pathway. The resulting enriched
+    feature maps serve as the key and value inputs for the decoder's
+    cross-attention mechanism.
+
+    Args:
+        encoder_in_channels: list of int, Input channel dimensions for each
+            feature level from the backbone.
+        feat_strides: list of int, Stride values for each feature level,
+            indicating the downsampling factor relative to the input image.
+        encoder_hidden_dim: int, Hidden dimension size used throughout the
+            encoder for feature projection and attention computation.
+        encode_proj_layers: list of int, Indices of feature levels to apply
+            transformer encoding to. Not all levels need transformer
+            processing.
+        positional_encoding_temperature: float, Temperature parameter for
+            sinusoidal positional embeddings used in transformer attention.
+        eval_size: tuple or None, Fixed evaluation size `(height, width)` for
+            consistent positional embeddings during inference. If `None`,
+            dynamic sizing is used.
+        normalize_before: bool, Whether to apply layer normalization before
+            attention and feed-forward operations in transformer layers.
+        num_attention_heads: int, Number of attention heads in multi-head
+            attention mechanisms within transformer layers.
+        dropout: float, Dropout probability applied to attention weights and
+            feed-forward networks for regularization.
+        layer_norm_eps: float, Small epsilon value for numerical stability in
+            layer normalization operations.
+        encoder_activation_function: str, Activation function used in
+            transformer feed-forward networks (e.g., `"relu"`, `"gelu"`).
+        activation_dropout: float, Dropout probability specifically applied to
+            activation functions in feed-forward networks.
+        encoder_ffn_dim: int, Hidden dimension size for feed-forward networks
+            within transformer layers.
+        num_encoder_layers: int, Number of transformer encoder layers to apply
+            at each selected feature level.
+        batch_norm_eps: float, Small epsilon value for numerical stability in
+            batch normalization operations used in components.
+        hidden_expansion: float, Expansion factor for hidden dimensions in
+            `DFineFeatureAggregationBlock` blocks used in FPN and PAN pathways.
+        depth_multiplier: float, Depth multiplier for scaling the number of
+            blocks in `DFineFeatureAggregationBlock` modules.
+        kernel_initializer: str or Initializer, optional, Initializer for
+            the kernel weights of each layer. Defaults to
+            `"glorot_uniform"`.
+        bias_initializer: str or Initializer, optional, Initializer for
+            the bias weights of each layer. Defaults to
+            `"zeros"`.
+        channel_axis: int, optional, The channel axis. Defaults to `None`.
+        data_format: str, optional, The data format. Defaults to `None`.
+        **kwargs: Additional keyword arguments passed to the parent class.
+    """
+
+    def __init__(
+        self,
+        encoder_in_channels,
+        feat_strides,
+        encoder_hidden_dim,
+        encode_proj_layers,
+        positional_encoding_temperature,
+        eval_size,
+        normalize_before,
+        num_attention_heads,
+        dropout,
+        layer_norm_eps,
+        encoder_activation_function,
+        activation_dropout,
+        encoder_ffn_dim,
+        num_encoder_layers,
+        batch_norm_eps,
+        hidden_expansion,
+        depth_multiplier,
+        kernel_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        channel_axis=None,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        super().__init__(dtype=dtype, **kwargs)
+
+        self.encoder_in_channels = encoder_in_channels
+        self.num_fpn_stages = len(self.encoder_in_channels) - 1
+        self.feat_strides = feat_strides
+        self.encoder_hidden_dim = encoder_hidden_dim
+        self.encode_proj_layers = encode_proj_layers
+        self.positional_encoding_temperature = positional_encoding_temperature
+        self.eval_size = eval_size
+        self.out_channels = [
+            self.encoder_hidden_dim for _ in self.encoder_in_channels
+        ]
+        self.out_strides = self.feat_strides
+        self.depth_multiplier = depth_multiplier
+        self.num_encoder_layers = num_encoder_layers
+        self.normalize_before = normalize_before
+        self.num_attention_heads = num_attention_heads
+        self.dropout_rate = dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.encoder_activation_function = encoder_activation_function
+        self.activation_dropout_rate = activation_dropout
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.batch_norm_eps = batch_norm_eps
+        self.hidden_expansion = hidden_expansion
+        self.kernel_initializer = kernel_initializer
+        self.bias_initializer = bias_initializer
+        self.channel_axis = channel_axis
+        self.data_format = data_format
+
+        self.encoder = [
+            DFineEncoder(
+                normalize_before=self.normalize_before,
+                encoder_hidden_dim=self.encoder_hidden_dim,
+                num_attention_heads=self.num_attention_heads,
+                dropout=self.dropout_rate,
+                layer_norm_eps=self.layer_norm_eps,
+                encoder_activation_function=self.encoder_activation_function,
+                activation_dropout=self.activation_dropout_rate,
+                encoder_ffn_dim=self.encoder_ffn_dim,
+                dtype=self.dtype_policy,
+                num_encoder_layers=self.num_encoder_layers,
+                kernel_initializer=self.kernel_initializer,
+                bias_initializer=self.bias_initializer,
+                name=f"d_fine_encoder_{i}",
+            )
+            for i in range(len(self.encode_proj_layers))
+        ]
+
+        self.lateral_convs = []
+        self.fpn_blocks = []
+        for i in range(len(self.encoder_in_channels) - 1, 0, -1):
+            lateral_layer = DFineConvNormLayer(
+                filters=self.encoder_hidden_dim,
+                kernel_size=1,
+                batch_norm_eps=self.batch_norm_eps,
+                stride=1,
+                groups=1,
+                padding=0,
+                activation_function=None,
+                dtype=self.dtype_policy,
+                kernel_initializer=self.kernel_initializer,
+                bias_initializer=self.bias_initializer,
+                channel_axis=self.channel_axis,
+                name=f"lateral_conv_{i}",
+            )
+            self.lateral_convs.append(lateral_layer)
+            num_blocks = round(3 * self.depth_multiplier)
+            fpn_layer = DFineFeatureAggregationBlock(
+                encoder_hidden_dim=self.encoder_hidden_dim,
+                hidden_expansion=self.hidden_expansion,
+                batch_norm_eps=self.batch_norm_eps,
+                activation_function="silu",
+                num_blocks=num_blocks,
+                dtype=self.dtype_policy,
+                kernel_initializer=self.kernel_initializer,
+                bias_initializer=self.bias_initializer,
+                channel_axis=self.channel_axis,
+                name=f"fpn_block_{i}",
+            )
+            self.fpn_blocks.append(fpn_layer)
+
+        self.downsample_convs = []
+        self.pan_blocks = []
+        for i in range(len(self.encoder_in_channels) - 1):
+            num_blocks = round(3 * self.depth_multiplier)
+            self.downsample_convs.append(
+                DFineSCDown(
+                    encoder_hidden_dim=self.encoder_hidden_dim,
+                    batch_norm_eps=self.batch_norm_eps,
+                    kernel_size=3,
+                    stride=2,
+                    dtype=self.dtype_policy,
+                    kernel_initializer=self.kernel_initializer,
+                    bias_initializer=self.bias_initializer,
+                    channel_axis=self.channel_axis,
+                    name=f"downsample_conv_{i}",
+                )
+            )
+            self.pan_blocks.append(
+                DFineFeatureAggregationBlock(
+                    encoder_hidden_dim=self.encoder_hidden_dim,
+                    hidden_expansion=self.hidden_expansion,
+                    batch_norm_eps=self.batch_norm_eps,
+                    activation_function="silu",
+                    num_blocks=num_blocks,
+                    dtype=self.dtype_policy,
+                    kernel_initializer=self.kernel_initializer,
+                    bias_initializer=self.bias_initializer,
+                    channel_axis=self.channel_axis,
+                    name=f"pan_block_{i}",
+                )
+            )
+
+        self.upsample = keras.layers.UpSampling2D(
+            size=(2, 2),
+            interpolation="nearest",
+            dtype=self.dtype_policy,
+            data_format=self.data_format,
+            name="upsample",
+        )
+        self.identity = keras.layers.Identity(
+            dtype=self.dtype_policy, name="identity"
+        )
+
+    def build(self, input_shape):
+        inputs_embeds_shapes = input_shape
+        # Encoder layers.
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                feature_map_shape = inputs_embeds_shapes[enc_ind]
+                if self.data_format == "channels_last":
+                    batch_s, h_s, w_s, c_s = feature_map_shape
+                else:  # channels_first
+                    batch_s, c_s, h_s, w_s = feature_map_shape
+                if h_s is not None and w_s is not None:
+                    seq_len_for_this_encoder = h_s * w_s
+                else:
+                    seq_len_for_this_encoder = None
+                encoder_input_shape = (batch_s, seq_len_for_this_encoder, c_s)
+                self.encoder[i].build(encoder_input_shape)
+        # FPN and PAN pathways.
+        # FPN (Top-down pathway).
+        fpn_feature_maps_shapes = [inputs_embeds_shapes[-1]]
+        for idx, (lateral_conv, fpn_block) in enumerate(
+            zip(self.lateral_convs, self.fpn_blocks)
+        ):
+            lateral_conv.build(fpn_feature_maps_shapes[-1])
+            shape_after_lateral_conv = lateral_conv.compute_output_shape(
+                fpn_feature_maps_shapes[-1]
+            )
+            if self.data_format == "channels_last":
+                batch_s, orig_h, orig_w, c = shape_after_lateral_conv
+                target_h = orig_h * 2 if orig_h is not None else None
+                target_w = orig_w * 2 if orig_w is not None else None
+                shape_after_resize = (batch_s, target_h, target_w, c)
+            else:
+                batch_s, c, orig_h, orig_w = shape_after_lateral_conv
+                target_h = orig_h * 2 if orig_h is not None else None
+                target_w = orig_w * 2 if orig_w is not None else None
+                shape_after_resize = (batch_s, c, target_h, target_w)
+            backbone_feature_map_k_shape = inputs_embeds_shapes[
+                self.num_fpn_stages - idx - 1
+            ]
+            shape_after_concat_fpn = list(shape_after_resize)
+            shape_after_concat_fpn[self.channel_axis] += (
+                backbone_feature_map_k_shape[self.channel_axis]
+            )
+            shape_after_concat_fpn = tuple(shape_after_concat_fpn)
+            fpn_block.build(shape_after_concat_fpn)
+            fpn_feature_maps_shapes.append(
+                fpn_block.compute_output_shape(shape_after_concat_fpn)
+            )
+        # PAN (Bottom-up pathway).
+        reversed_fpn_feature_maps_shapes = fpn_feature_maps_shapes[::-1]
+        pan_feature_maps_shapes = [reversed_fpn_feature_maps_shapes[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(
+            zip(self.downsample_convs, self.pan_blocks)
+        ):
+            downsample_conv.build(pan_feature_maps_shapes[-1])
+            shape_after_downsample = downsample_conv.compute_output_shape(
+                pan_feature_maps_shapes[-1]
+            )
+            fpn_shape = reversed_fpn_feature_maps_shapes[idx + 1]
+            concat_shape = list(shape_after_downsample)
+            concat_shape[self.channel_axis] += fpn_shape[self.channel_axis]
+            pan_block.build(tuple(concat_shape))
+            pan_feature_maps_shapes.append(
+                pan_block.compute_output_shape(tuple(concat_shape))
+            )
+        super().build(input_shape)
+
+    def call(
+        self,
+        inputs_embeds,
+        attention_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        training=None,
+    ):
+        hidden_states = [keras.ops.convert_to_tensor(t) for t in inputs_embeds]
+
+        output_attentions = (
+            output_attentions if output_attentions is not None else False
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else False
+        )
+
+        encoder_states_tuple = () if output_hidden_states else None
+        all_attentions_tuple = () if output_attentions else None
+
+        processed_maps = {}
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                current_feature_map = hidden_states[enc_ind]
+                if output_hidden_states:
+                    encoder_states_tuple = encoder_states_tuple + (
+                        self.identity(current_feature_map),
+                    )
+
+                batch_size = keras.ops.shape(current_feature_map)[0]
+                if self.data_format == "channels_last":
+                    height = keras.ops.shape(current_feature_map)[1]
+                    width = keras.ops.shape(current_feature_map)[2]
+                    channels = keras.ops.shape(current_feature_map)[-1]
+                    src_flatten = keras.ops.reshape(
+                        current_feature_map,
+                        (batch_size, height * width, channels),
+                    )
+                else:
+                    channels = keras.ops.shape(current_feature_map)[1]
+                    height = keras.ops.shape(current_feature_map)[2]
+                    width = keras.ops.shape(current_feature_map)[3]
+
+                    transposed_map = keras.ops.transpose(
+                        current_feature_map, (0, 2, 3, 1)
+                    )
+                    src_flatten = keras.ops.reshape(
+                        transposed_map,
+                        (batch_size, height * width, channels),
+                    )
+
+                pos_embed = None
+                if training or self.eval_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        width,
+                        height,
+                        self.encoder_hidden_dim,
+                        self.positional_encoding_temperature,
+                        dtype=self.compute_dtype,
+                    )
+                encoder_output = self.encoder[i](
+                    src=src_flatten,
+                    src_mask=attention_mask,
+                    pos_embed=pos_embed,
+                    output_attentions=output_attentions,
+                    training=training,
+                )
+                if output_attentions:
+                    processed_feature_map, layer_attentions = encoder_output
+                else:
+                    processed_feature_map, layer_attentions = (
+                        encoder_output,
+                        None,
+                    )
+
+                if self.data_format == "channels_last":
+                    processed_maps[enc_ind] = keras.ops.reshape(
+                        processed_feature_map,
+                        (batch_size, height, width, self.encoder_hidden_dim),
+                    )
+                else:
+                    reshaped_map = keras.ops.reshape(
+                        processed_feature_map,
+                        (batch_size, height, width, self.encoder_hidden_dim),
+                    )
+                    processed_maps[enc_ind] = keras.ops.transpose(
+                        reshaped_map, (0, 3, 1, 2)
+                    )
+
+                if output_attentions and layer_attentions is not None:
+                    all_attentions_tuple = all_attentions_tuple + (
+                        layer_attentions,
+                    )
+
+        processed_hidden_states = []
+        for i in range(len(hidden_states)):
+            if i in processed_maps:
+                processed_hidden_states.append(processed_maps[i])
+            else:
+                processed_hidden_states.append(hidden_states[i])
+        if self.num_encoder_layers > 0:
+            if output_hidden_states:
+                encoder_states_tuple = encoder_states_tuple + (
+                    self.identity(
+                        processed_hidden_states[self.encode_proj_layers[-1]]
+                    ),
+                )
+        else:
+            processed_hidden_states = hidden_states
+        fpn_inter_outputs = []
+        y = processed_hidden_states[-1]
+        for idx, (lateral_conv, fpn_block) in enumerate(
+            zip(self.lateral_convs, self.fpn_blocks)
+        ):
+            backbone_feature_map_k = processed_hidden_states[
+                self.num_fpn_stages - idx - 1
+            ]
+            y_lateral = lateral_conv(y, training=training)
+            fpn_inter_outputs.append(y_lateral)
+            y_upsampled = self.upsample(y_lateral, training=training)
+            fused_feature_map_k = keras.ops.concatenate(
+                [y_upsampled, backbone_feature_map_k],
+                axis=self.channel_axis,
+            )
+            y = fpn_block(fused_feature_map_k, training=training)
+        fpn_feature_maps = fpn_inter_outputs + [y]
+
+        fpn_feature_maps = fpn_feature_maps[::-1]
+
+        pan_feature_maps = [fpn_feature_maps[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(
+            zip(self.downsample_convs, self.pan_blocks)
+        ):
+            top_pan_feature_map_k = pan_feature_maps[-1]
+            fpn_feature_map_k = fpn_feature_maps[idx + 1]
+
+            downsampled_feature_map_k = downsample_conv(
+                top_pan_feature_map_k, training=training
+            )
+            fused_feature_map_k = keras.ops.concatenate(
+                [downsampled_feature_map_k, fpn_feature_map_k],
+                axis=self.channel_axis,
+            )
+            new_pan_feature_map_k = pan_block(
+                fused_feature_map_k, training=training
+            )
+            pan_feature_maps.append(new_pan_feature_map_k)
+
+        return tuple(
+            v
+            for v in [
+                pan_feature_maps,
+                encoder_states_tuple if output_hidden_states else None,
+                all_attentions_tuple if output_attentions else None,
+            ]
+            if v is not None
+        )
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(
+        width,
+        height,
+        embedding_dim=256,
+        temperature=10000.0,
+        dtype="float32",
+    ):
+        grid_w = keras.ops.arange(width, dtype=dtype)
+        grid_h = keras.ops.arange(height, dtype=dtype)
+        grid_w, grid_h = keras.ops.meshgrid(grid_w, grid_h, indexing="ij")
+        if embedding_dim % 4 != 0:
+            raise ValueError(
+                "Embed dimension must be divisible by 4 for 2D sin-cos position"
+                " embedding"
+            )
+        pos_dim = embedding_dim // 4
+        omega = keras.ops.arange(pos_dim, dtype=dtype) / pos_dim
+        omega = 1.0 / (temperature**omega)
+
+        out_w = keras.ops.matmul(
+            keras.ops.reshape(grid_w, (-1, 1)),
+            keras.ops.reshape(omega, (1, -1)),
+        )
+        out_h = keras.ops.matmul(
+            keras.ops.reshape(grid_h, (-1, 1)),
+            keras.ops.reshape(omega, (1, -1)),
+        )
+
+        concatenated_embeds = keras.ops.concatenate(
+            [
+                keras.ops.sin(out_w),
+                keras.ops.cos(out_w),
+                keras.ops.sin(out_h),
+                keras.ops.cos(out_h),
+            ],
+            axis=1,
+        )
+        return keras.ops.expand_dims(concatenated_embeds, axis=0)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "encoder_in_channels": self.encoder_in_channels,
+                "feat_strides": self.feat_strides,
+                "encoder_hidden_dim": self.encoder_hidden_dim,
+                "encode_proj_layers": self.encode_proj_layers,
+                "positional_encoding_temperature": self.positional_encoding_temperature,  # noqa: E501
+                "eval_size": self.eval_size,
+                "normalize_before": self.normalize_before,
+                "num_attention_heads": self.num_attention_heads,
+                "dropout": self.dropout_rate,
+                "layer_norm_eps": self.layer_norm_eps,
+                "encoder_activation_function": self.encoder_activation_function,
+                "activation_dropout": self.activation_dropout_rate,
+                "encoder_ffn_dim": self.encoder_ffn_dim,
+                "num_encoder_layers": self.num_encoder_layers,
+                "batch_norm_eps": self.batch_norm_eps,
+                "hidden_expansion": self.hidden_expansion,
+                "depth_multiplier": self.depth_multiplier,
+                "kernel_initializer": self.kernel_initializer,
+                "bias_initializer": self.bias_initializer,
+                "channel_axis": self.channel_axis,
+                "data_format": self.data_format,
+            }
+        )
+        return config
+
+    def compute_output_spec(
+        self,
+        inputs_embeds,
+        attention_mask_spec=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        training=None,
+    ):
+        output_attentions = (
+            output_attentions if output_attentions is not None else False
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else False
+        )
+        hidden_states_specs = list(inputs_embeds)
+        encoder_states_tuple_specs = () if output_hidden_states else None
+        all_attentions_tuple_specs = () if output_attentions else None
+        processed_maps_specs = {}
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                current_feature_map_spec = hidden_states_specs[enc_ind]
+                if output_hidden_states:
+                    encoder_states_tuple_specs += (
+                        self.identity(current_feature_map_spec),
+                    )
+                if self.data_format == "channels_last":
+                    batch_size, h, w, c = current_feature_map_spec.shape
+                else:
+                    batch_size, c, h, w = current_feature_map_spec.shape
+                seq_len = h * w if h is not None and w is not None else None
+                src_flatten_spec = keras.KerasTensor(
+                    (batch_size, seq_len, c), dtype=self.compute_dtype
+                )
+                pos_embed_spec = keras.KerasTensor(
+                    (batch_size, seq_len, self.encoder_hidden_dim),
+                    dtype=self.compute_dtype,
+                )
+                encoder_output_spec = self.encoder[i].compute_output_spec(
+                    src=src_flatten_spec,
+                    src_mask=attention_mask_spec,
+                    pos_embed=pos_embed_spec,
+                    output_attentions=output_attentions,
+                )
+                if output_attentions:
+                    _, layer_attentions_spec = encoder_output_spec
+                    all_attentions_tuple_specs += (layer_attentions_spec,)
+                if self.data_format == "channels_last":
+                    processed_maps_specs[enc_ind] = keras.KerasTensor(
+                        (batch_size, h, w, self.encoder_hidden_dim),
+                        dtype=self.compute_dtype,
+                    )
+                else:
+                    processed_maps_specs[enc_ind] = keras.KerasTensor(
+                        (batch_size, self.encoder_hidden_dim, h, w),
+                        dtype=self.compute_dtype,
+                    )
+        processed_hidden_states_specs = []
+        for i in range(len(hidden_states_specs)):
+            if i in processed_maps_specs:
+                processed_hidden_states_specs.append(processed_maps_specs[i])
+            else:
+                processed_hidden_states_specs.append(hidden_states_specs[i])
+        if self.num_encoder_layers > 0:
+            if output_hidden_states:
+                encoder_states_tuple_specs += (
+                    self.identity(
+                        processed_hidden_states_specs[
+                            self.encode_proj_layers[-1]
+                        ]
+                    ),
+                )
+        else:
+            processed_hidden_states_specs = hidden_states_specs
+        fpn_inter_outputs_specs = []
+        y_spec = processed_hidden_states_specs[-1]
+        for idx, (lateral_conv, fpn_block) in enumerate(
+            zip(self.lateral_convs, self.fpn_blocks)
+        ):
+            backbone_feature_map_k_spec = processed_hidden_states_specs[
+                self.num_fpn_stages - idx - 1
+            ]
+            y_lateral_spec = keras.KerasTensor(
+                lateral_conv.compute_output_shape(y_spec.shape),
+                dtype=self.compute_dtype,
+            )
+            fpn_inter_outputs_specs.append(y_lateral_spec)
+            y_upsampled_spec = keras.KerasTensor(
+                self.upsample.compute_output_shape(y_lateral_spec.shape),
+                dtype=self.compute_dtype,
+            )
+            concat_shape = list(y_upsampled_spec.shape)
+            concat_shape[self.channel_axis] += (
+                backbone_feature_map_k_spec.shape[self.channel_axis]
+            )
+            y_spec = keras.KerasTensor(
+                fpn_block.compute_output_shape(tuple(concat_shape)),
+                dtype=self.compute_dtype,
+            )
+        fpn_feature_maps_specs = fpn_inter_outputs_specs + [y_spec]
+        fpn_feature_maps_specs = fpn_feature_maps_specs[::-1]
+        pan_feature_maps_specs = [fpn_feature_maps_specs[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(
+            zip(self.downsample_convs, self.pan_blocks)
+        ):
+            top_pan_feature_map_k_spec = pan_feature_maps_specs[-1]
+            fpn_feature_map_k_spec = fpn_feature_maps_specs[idx + 1]
+            downsampled_feature_map_k_spec = keras.KerasTensor(
+                downsample_conv.compute_output_shape(
+                    top_pan_feature_map_k_spec.shape
+                ),
+                dtype=self.compute_dtype,
+            )
+            concat_shape = list(downsampled_feature_map_k_spec.shape)
+            concat_shape[self.channel_axis] += fpn_feature_map_k_spec.shape[
+                self.channel_axis
+            ]
+            new_pan_feature_map_k_spec = keras.KerasTensor(
+                pan_block.compute_output_shape(tuple(concat_shape)),
+                dtype=self.compute_dtype,
+            )
+            pan_feature_maps_specs.append(new_pan_feature_map_k_spec)
+        outputs = [
+            tuple(pan_feature_maps_specs),
+        ]
+        if output_hidden_states:
+            outputs.append(encoder_states_tuple_specs)
+        if output_attentions:
+            outputs.append(all_attentions_tuple_specs)
+        return tuple(outputs) if len(outputs) > 1 else outputs[0]
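The new `DFineHybridEncoder` above is an internal layer (it lives under `keras_hub/src/`), but it can be exercised on its own. Below is a minimal sketch of how it might be called; every hyperparameter value is illustrative rather than taken from a released preset, and the multi-scale inputs are assumed to already carry `encoder_hidden_dim` channels before they reach this layer.

import numpy as np

from keras_hub.src.models.d_fine.d_fine_hybrid_encoder import (
    DFineHybridEncoder,
)

# Illustrative hyperparameters (not preset values).
hidden_dim = 256
hybrid_encoder = DFineHybridEncoder(
    encoder_in_channels=[256, 512, 1024],
    feat_strides=[8, 16, 32],
    encoder_hidden_dim=hidden_dim,
    encode_proj_layers=[2],  # self-attention only on the coarsest level
    positional_encoding_temperature=10000,
    eval_size=None,
    normalize_before=False,
    num_attention_heads=8,
    dropout=0.0,
    layer_norm_eps=1e-5,
    encoder_activation_function="gelu",
    activation_dropout=0.0,
    encoder_ffn_dim=1024,
    num_encoder_layers=1,
    batch_norm_eps=1e-5,
    hidden_expansion=1.0,
    depth_multiplier=1.0,
    channel_axis=-1,
    data_format="channels_last",
)

# Three feature levels for a 640x640 image (strides 8, 16, 32), assumed to
# have been projected to `hidden_dim` channels already.
feature_maps = [
    np.random.uniform(size=(1, 640 // s, 640 // s, hidden_dim)).astype(
        "float32"
    )
    for s in (8, 16, 32)
]
outputs = hybrid_encoder(feature_maps)
# `outputs[0]` holds the fused FPN/PAN maps, one per level, each with
# `hidden_dim` channels and the same spatial size as its input level.
print([tuple(m.shape) for m in outputs[0]])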
--- /dev/null
+++ keras_hub/src/models/d_fine/d_fine_image_converter.py
@@ -0,0 +1,8 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
+from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+
+
+@keras_hub_export("keras_hub.layers.DFineImageConverter")
+class DFineImageConverter(ImageConverter):
+    backbone_cls = DFineBackbone
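`DFineImageConverter` only pins `backbone_cls`; its preprocessing behavior is inherited from the base `ImageConverter`. The following is a minimal standalone sketch, assuming the base class's usual `image_size`/`scale` constructor arguments and purely illustrative values; in a typical workflow the converter would instead be loaded from a preset.

import numpy as np

from keras_hub.src.models.d_fine.d_fine_image_converter import (
    DFineImageConverter,
)

# Resize to a fixed detector input size and rescale uint8 pixels to [0, 1].
# The values here are illustrative, not tied to any released preset.
converter = DFineImageConverter(image_size=(640, 640), scale=1.0 / 255.0)
images = np.random.randint(0, 256, size=(2, 480, 720, 3), dtype="uint8")
batch = converter(images)
print(batch.shape)  # expected: (2, 640, 640, 3)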