keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. keras_hub/layers/__init__.py +6 -0
  2. keras_hub/models/__init__.py +21 -0
  3. keras_hub/src/layers/modeling/position_embedding.py +21 -6
  4. keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
  5. keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
  6. keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
  7. keras_hub/src/models/backbone.py +10 -15
  8. keras_hub/src/models/d_fine/__init__.py +0 -0
  9. keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
  10. keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
  11. keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
  12. keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
  13. keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
  14. keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
  15. keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
  16. keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
  17. keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
  18. keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
  19. keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
  20. keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
  21. keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
  22. keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
  23. keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
  24. keras_hub/src/models/parseq/__init__.py +0 -0
  25. keras_hub/src/models/parseq/parseq_backbone.py +134 -0
  26. keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
  27. keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
  28. keras_hub/src/models/parseq/parseq_decoder.py +418 -0
  29. keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
  30. keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
  31. keras_hub/src/tests/test_case.py +37 -1
  32. keras_hub/src/utils/preset_utils.py +49 -0
  33. keras_hub/src/utils/tensor_utils.py +23 -1
  34. keras_hub/src/utils/transformers/convert_vit.py +4 -1
  35. keras_hub/src/version.py +1 -1
  36. keras_hub/tokenizers/__init__.py +3 -0
  37. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
  38. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
  39. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
  40. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py
@@ -0,0 +1,642 @@
+import keras
+
+from keras_hub.src.models.d_fine.d_fine_encoder import DFineEncoder
+from keras_hub.src.models.d_fine.d_fine_layers import DFineConvNormLayer
+from keras_hub.src.models.d_fine.d_fine_layers import (
+    DFineFeatureAggregationBlock,
+)
+from keras_hub.src.models.d_fine.d_fine_layers import DFineSCDown
+
+
+class DFineHybridEncoder(keras.layers.Layer):
+    """Hybrid encoder for the D-FINE model.
+
+    This layer sits between the HGNetV2 backbone (`HGNetV2Backbone`) and the
+    main `DFineDecoder`. It takes multi-scale feature maps from the backbone,
+    optionally refines them with transformer-based `DFineEncoder` layers, and
+    then fuses them using a Feature Pyramid Network (FPN) top-down pathway and a
+    Path Aggregation Network (PAN) bottom-up pathway. The resulting enriched
+    feature maps serve as the key and value inputs for the decoder's
+    cross-attention mechanism.
+
+    Args:
+        encoder_in_channels: list of int, Input channel dimensions for each
+            feature level from the backbone.
+        feat_strides: list of int, Stride values for each feature level,
+            indicating the downsampling factor relative to the input image.
+        encoder_hidden_dim: int, Hidden dimension size used throughout the
+            encoder for feature projection and attention computation.
+        encode_proj_layers: list of int, Indices of feature levels to apply
+            transformer encoding to. Not all levels need transformer
+            processing.
+        positional_encoding_temperature: float, Temperature parameter for
+            sinusoidal positional embeddings used in transformer attention.
+        eval_size: tuple or None, Fixed evaluation size `(height, width)` for
+            consistent positional embeddings during inference. If `None`,
+            dynamic sizing is used.
+        normalize_before: bool, Whether to apply layer normalization before
+            attention and feed-forward operations in transformer layers.
+        num_attention_heads: int, Number of attention heads in multi-head
+            attention mechanisms within transformer layers.
+        dropout: float, Dropout probability applied to attention weights and
+            feed-forward networks for regularization.
+        layer_norm_eps: float, Small epsilon value for numerical stability in
+            layer normalization operations.
+        encoder_activation_function: str, Activation function used in
+            transformer feed-forward networks (e.g., `"relu"`, `"gelu"`).
+        activation_dropout: float, Dropout probability specifically applied to
+            activation functions in feed-forward networks.
+        encoder_ffn_dim: int, Hidden dimension size for feed-forward networks
+            within transformer layers.
+        num_encoder_layers: int, Number of transformer encoder layers to apply
+            at each selected feature level.
+        batch_norm_eps: float, Small epsilon value for numerical stability in
+            batch normalization operations used in components.
+        hidden_expansion: float, Expansion factor for hidden dimensions in
+            `DFineFeatureAggregationBlock` blocks used in FPN and PAN pathways.
+        depth_multiplier: float, Depth multiplier for scaling the number of
+            blocks in `DFineFeatureAggregationBlock` modules.
+        kernel_initializer: str or Initializer, optional, Initializer for
+            the kernel weights of each layer. Defaults to
+            `"glorot_uniform"`.
+        bias_initializer: str or Initializer, optional, Initializer for
+            the bias weights of each layer. Defaults to
+            `"zeros"`.
+        channel_axis: int, optional, The channel axis. Defaults to `None`.
+        data_format: str, optional, The data format. Defaults to `None`.
+        **kwargs: Additional keyword arguments passed to the parent class.
+    """
+
+    def __init__(
+        self,
+        encoder_in_channels,
+        feat_strides,
+        encoder_hidden_dim,
+        encode_proj_layers,
+        positional_encoding_temperature,
+        eval_size,
+        normalize_before,
+        num_attention_heads,
+        dropout,
+        layer_norm_eps,
+        encoder_activation_function,
+        activation_dropout,
+        encoder_ffn_dim,
+        num_encoder_layers,
+        batch_norm_eps,
+        hidden_expansion,
+        depth_multiplier,
+        kernel_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        channel_axis=None,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        super().__init__(dtype=dtype, **kwargs)
+
+        self.encoder_in_channels = encoder_in_channels
+        self.num_fpn_stages = len(self.encoder_in_channels) - 1
+        self.feat_strides = feat_strides
+        self.encoder_hidden_dim = encoder_hidden_dim
+        self.encode_proj_layers = encode_proj_layers
+        self.positional_encoding_temperature = positional_encoding_temperature
+        self.eval_size = eval_size
+        self.out_channels = [
+            self.encoder_hidden_dim for _ in self.encoder_in_channels
+        ]
+        self.out_strides = self.feat_strides
+        self.depth_multiplier = depth_multiplier
+        self.num_encoder_layers = num_encoder_layers
+        self.normalize_before = normalize_before
+        self.num_attention_heads = num_attention_heads
+        self.dropout_rate = dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.encoder_activation_function = encoder_activation_function
+        self.activation_dropout_rate = activation_dropout
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.batch_norm_eps = batch_norm_eps
+        self.hidden_expansion = hidden_expansion
+        self.kernel_initializer = kernel_initializer
+        self.bias_initializer = bias_initializer
+        self.channel_axis = channel_axis
+        self.data_format = data_format
+
+        self.encoder = [
+            DFineEncoder(
+                normalize_before=self.normalize_before,
+                encoder_hidden_dim=self.encoder_hidden_dim,
+                num_attention_heads=self.num_attention_heads,
+                dropout=self.dropout_rate,
+                layer_norm_eps=self.layer_norm_eps,
+                encoder_activation_function=self.encoder_activation_function,
+                activation_dropout=self.activation_dropout_rate,
+                encoder_ffn_dim=self.encoder_ffn_dim,
+                dtype=self.dtype_policy,
+                num_encoder_layers=self.num_encoder_layers,
+                kernel_initializer=self.kernel_initializer,
+                bias_initializer=self.bias_initializer,
+                name=f"d_fine_encoder_{i}",
+            )
+            for i in range(len(self.encode_proj_layers))
+        ]
+
+        self.lateral_convs = []
+        self.fpn_blocks = []
+        for i in range(len(self.encoder_in_channels) - 1, 0, -1):
+            lateral_layer = DFineConvNormLayer(
+                filters=self.encoder_hidden_dim,
+                kernel_size=1,
+                batch_norm_eps=self.batch_norm_eps,
+                stride=1,
+                groups=1,
+                padding=0,
+                activation_function=None,
+                dtype=self.dtype_policy,
+                kernel_initializer=self.kernel_initializer,
+                bias_initializer=self.bias_initializer,
+                channel_axis=self.channel_axis,
+                name=f"lateral_conv_{i}",
+            )
+            self.lateral_convs.append(lateral_layer)
+            num_blocks = round(3 * self.depth_multiplier)
+            fpn_layer = DFineFeatureAggregationBlock(
+                encoder_hidden_dim=self.encoder_hidden_dim,
+                hidden_expansion=self.hidden_expansion,
+                batch_norm_eps=self.batch_norm_eps,
+                activation_function="silu",
+                num_blocks=num_blocks,
+                dtype=self.dtype_policy,
+                kernel_initializer=self.kernel_initializer,
+                bias_initializer=self.bias_initializer,
+                channel_axis=self.channel_axis,
+                name=f"fpn_block_{i}",
+            )
+            self.fpn_blocks.append(fpn_layer)
+
+        self.downsample_convs = []
+        self.pan_blocks = []
+        for i in range(len(self.encoder_in_channels) - 1):
+            num_blocks = round(3 * self.depth_multiplier)
+            self.downsample_convs.append(
+                DFineSCDown(
+                    encoder_hidden_dim=self.encoder_hidden_dim,
+                    batch_norm_eps=self.batch_norm_eps,
+                    kernel_size=3,
+                    stride=2,
+                    dtype=self.dtype_policy,
+                    kernel_initializer=self.kernel_initializer,
+                    bias_initializer=self.bias_initializer,
+                    channel_axis=self.channel_axis,
+                    name=f"downsample_conv_{i}",
+                )
+            )
+            self.pan_blocks.append(
+                DFineFeatureAggregationBlock(
+                    encoder_hidden_dim=self.encoder_hidden_dim,
+                    hidden_expansion=self.hidden_expansion,
+                    batch_norm_eps=self.batch_norm_eps,
+                    activation_function="silu",
+                    num_blocks=num_blocks,
+                    dtype=self.dtype_policy,
+                    kernel_initializer=self.kernel_initializer,
+                    bias_initializer=self.bias_initializer,
+                    channel_axis=self.channel_axis,
+                    name=f"pan_block_{i}",
+                )
+            )
+
+        self.upsample = keras.layers.UpSampling2D(
+            size=(2, 2),
+            interpolation="nearest",
+            dtype=self.dtype_policy,
+            data_format=self.data_format,
+            name="upsample",
+        )
+        self.identity = keras.layers.Identity(
+            dtype=self.dtype_policy, name="identity"
+        )
+
+    def build(self, input_shape):
+        inputs_embeds_shapes = input_shape
+        # Encoder layers.
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                feature_map_shape = inputs_embeds_shapes[enc_ind]
+                if self.data_format == "channels_last":
+                    batch_s, h_s, w_s, c_s = feature_map_shape
+                else:  # channels_first
+                    batch_s, c_s, h_s, w_s = feature_map_shape
+                if h_s is not None and w_s is not None:
+                    seq_len_for_this_encoder = h_s * w_s
+                else:
+                    seq_len_for_this_encoder = None
+                encoder_input_shape = (batch_s, seq_len_for_this_encoder, c_s)
+                self.encoder[i].build(encoder_input_shape)
+        # FPN and PAN pathways.
+        # FPN (Top-down pathway).
+        fpn_feature_maps_shapes = [inputs_embeds_shapes[-1]]
+        for idx, (lateral_conv, fpn_block) in enumerate(
+            zip(self.lateral_convs, self.fpn_blocks)
+        ):
+            lateral_conv.build(fpn_feature_maps_shapes[-1])
+            shape_after_lateral_conv = lateral_conv.compute_output_shape(
+                fpn_feature_maps_shapes[-1]
+            )
+            if self.data_format == "channels_last":
+                batch_s, orig_h, orig_w, c = shape_after_lateral_conv
+                target_h = orig_h * 2 if orig_h is not None else None
+                target_w = orig_w * 2 if orig_w is not None else None
+                shape_after_resize = (batch_s, target_h, target_w, c)
+            else:
+                batch_s, c, orig_h, orig_w = shape_after_lateral_conv
+                target_h = orig_h * 2 if orig_h is not None else None
+                target_w = orig_w * 2 if orig_w is not None else None
+                shape_after_resize = (batch_s, c, target_h, target_w)
+            backbone_feature_map_k_shape = inputs_embeds_shapes[
+                self.num_fpn_stages - idx - 1
+            ]
+            shape_after_concat_fpn = list(shape_after_resize)
+            shape_after_concat_fpn[self.channel_axis] += (
+                backbone_feature_map_k_shape[self.channel_axis]
+            )
+            shape_after_concat_fpn = tuple(shape_after_concat_fpn)
+            fpn_block.build(shape_after_concat_fpn)
+            fpn_feature_maps_shapes.append(
+                fpn_block.compute_output_shape(shape_after_concat_fpn)
+            )
+        # PAN (Bottom-up pathway).
+        reversed_fpn_feature_maps_shapes = fpn_feature_maps_shapes[::-1]
+        pan_feature_maps_shapes = [reversed_fpn_feature_maps_shapes[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(
+            zip(self.downsample_convs, self.pan_blocks)
+        ):
+            downsample_conv.build(pan_feature_maps_shapes[-1])
+            shape_after_downsample = downsample_conv.compute_output_shape(
+                pan_feature_maps_shapes[-1]
+            )
+            fpn_shape = reversed_fpn_feature_maps_shapes[idx + 1]
+            concat_shape = list(shape_after_downsample)
+            concat_shape[self.channel_axis] += fpn_shape[self.channel_axis]
+            pan_block.build(tuple(concat_shape))
+            pan_feature_maps_shapes.append(
+                pan_block.compute_output_shape(tuple(concat_shape))
+            )
+        super().build(input_shape)
+
+    def call(
+        self,
+        inputs_embeds,
+        attention_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        training=None,
+    ):
+        hidden_states = [keras.ops.convert_to_tensor(t) for t in inputs_embeds]
+
+        output_attentions = (
+            output_attentions if output_attentions is not None else False
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else False
+        )
+
+        encoder_states_tuple = () if output_hidden_states else None
+        all_attentions_tuple = () if output_attentions else None
+
+        processed_maps = {}
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                current_feature_map = hidden_states[enc_ind]
+                if output_hidden_states:
+                    encoder_states_tuple = encoder_states_tuple + (
+                        self.identity(current_feature_map),
+                    )
+
+                batch_size = keras.ops.shape(current_feature_map)[0]
+                if self.data_format == "channels_last":
+                    height = keras.ops.shape(current_feature_map)[1]
+                    width = keras.ops.shape(current_feature_map)[2]
+                    channels = keras.ops.shape(current_feature_map)[-1]
+                    src_flatten = keras.ops.reshape(
+                        current_feature_map,
+                        (batch_size, height * width, channels),
+                    )
+                else:
+                    channels = keras.ops.shape(current_feature_map)[1]
+                    height = keras.ops.shape(current_feature_map)[2]
+                    width = keras.ops.shape(current_feature_map)[3]
+
+                    transposed_map = keras.ops.transpose(
+                        current_feature_map, (0, 2, 3, 1)
+                    )
+                    src_flatten = keras.ops.reshape(
+                        transposed_map,
+                        (batch_size, height * width, channels),
+                    )
+
+                pos_embed = None
+                if training or self.eval_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        width,
+                        height,
+                        self.encoder_hidden_dim,
+                        self.positional_encoding_temperature,
+                        dtype=self.compute_dtype,
+                    )
+                encoder_output = self.encoder[i](
+                    src=src_flatten,
+                    src_mask=attention_mask,
+                    pos_embed=pos_embed,
+                    output_attentions=output_attentions,
+                    training=training,
+                )
+                if output_attentions:
+                    processed_feature_map, layer_attentions = encoder_output
+                else:
+                    processed_feature_map, layer_attentions = (
+                        encoder_output,
+                        None,
+                    )
+
+                if self.data_format == "channels_last":
+                    processed_maps[enc_ind] = keras.ops.reshape(
+                        processed_feature_map,
+                        (batch_size, height, width, self.encoder_hidden_dim),
+                    )
+                else:
+                    reshaped_map = keras.ops.reshape(
+                        processed_feature_map,
+                        (batch_size, height, width, self.encoder_hidden_dim),
+                    )
+                    processed_maps[enc_ind] = keras.ops.transpose(
+                        reshaped_map, (0, 3, 1, 2)
+                    )
+
+                if output_attentions and layer_attentions is not None:
+                    all_attentions_tuple = all_attentions_tuple + (
+                        layer_attentions,
+                    )
+
+        processed_hidden_states = []
+        for i in range(len(hidden_states)):
+            if i in processed_maps:
+                processed_hidden_states.append(processed_maps[i])
+            else:
+                processed_hidden_states.append(hidden_states[i])
+        if self.num_encoder_layers > 0:
+            if output_hidden_states:
+                encoder_states_tuple = encoder_states_tuple + (
+                    self.identity(
+                        processed_hidden_states[self.encode_proj_layers[-1]]
+                    ),
+                )
+        else:
+            processed_hidden_states = hidden_states
+        fpn_inter_outputs = []
+        y = processed_hidden_states[-1]
+        for idx, (lateral_conv, fpn_block) in enumerate(
+            zip(self.lateral_convs, self.fpn_blocks)
+        ):
+            backbone_feature_map_k = processed_hidden_states[
+                self.num_fpn_stages - idx - 1
+            ]
+            y_lateral = lateral_conv(y, training=training)
+            fpn_inter_outputs.append(y_lateral)
+            y_upsampled = self.upsample(y_lateral, training=training)
+            fused_feature_map_k = keras.ops.concatenate(
+                [y_upsampled, backbone_feature_map_k],
+                axis=self.channel_axis,
+            )
+            y = fpn_block(fused_feature_map_k, training=training)
+        fpn_feature_maps = fpn_inter_outputs + [y]
+
+        fpn_feature_maps = fpn_feature_maps[::-1]
+
+        pan_feature_maps = [fpn_feature_maps[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(
+            zip(self.downsample_convs, self.pan_blocks)
+        ):
+            top_pan_feature_map_k = pan_feature_maps[-1]
+            fpn_feature_map_k = fpn_feature_maps[idx + 1]
+
+            downsampled_feature_map_k = downsample_conv(
+                top_pan_feature_map_k, training=training
+            )
+            fused_feature_map_k = keras.ops.concatenate(
+                [downsampled_feature_map_k, fpn_feature_map_k],
+                axis=self.channel_axis,
+            )
+            new_pan_feature_map_k = pan_block(
+                fused_feature_map_k, training=training
+            )
+            pan_feature_maps.append(new_pan_feature_map_k)
+
+        return tuple(
+            v
+            for v in [
+                pan_feature_maps,
+                encoder_states_tuple if output_hidden_states else None,
+                all_attentions_tuple if output_attentions else None,
+            ]
+            if v is not None
+        )
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(
+        width,
+        height,
+        embedding_dim=256,
+        temperature=10000.0,
+        dtype="float32",
+    ):
+        grid_w = keras.ops.arange(width, dtype=dtype)
+        grid_h = keras.ops.arange(height, dtype=dtype)
+        grid_w, grid_h = keras.ops.meshgrid(grid_w, grid_h, indexing="ij")
+        if embedding_dim % 4 != 0:
+            raise ValueError(
+                "Embed dimension must be divisible by 4 for 2D sin-cos position"
+                " embedding"
+            )
+        pos_dim = embedding_dim // 4
+        omega = keras.ops.arange(pos_dim, dtype=dtype) / pos_dim
+        omega = 1.0 / (temperature**omega)
+
+        out_w = keras.ops.matmul(
+            keras.ops.reshape(grid_w, (-1, 1)),
+            keras.ops.reshape(omega, (1, -1)),
+        )
+        out_h = keras.ops.matmul(
+            keras.ops.reshape(grid_h, (-1, 1)),
+            keras.ops.reshape(omega, (1, -1)),
+        )
+
+        concatenated_embeds = keras.ops.concatenate(
+            [
+                keras.ops.sin(out_w),
+                keras.ops.cos(out_w),
+                keras.ops.sin(out_h),
+                keras.ops.cos(out_h),
+            ],
+            axis=1,
+        )
+        return keras.ops.expand_dims(concatenated_embeds, axis=0)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "encoder_in_channels": self.encoder_in_channels,
+                "feat_strides": self.feat_strides,
+                "encoder_hidden_dim": self.encoder_hidden_dim,
+                "encode_proj_layers": self.encode_proj_layers,
+                "positional_encoding_temperature": self.positional_encoding_temperature,  # noqa: E501
+                "eval_size": self.eval_size,
+                "normalize_before": self.normalize_before,
+                "num_attention_heads": self.num_attention_heads,
+                "dropout": self.dropout_rate,
+                "layer_norm_eps": self.layer_norm_eps,
+                "encoder_activation_function": self.encoder_activation_function,
+                "activation_dropout": self.activation_dropout_rate,
+                "encoder_ffn_dim": self.encoder_ffn_dim,
+                "num_encoder_layers": self.num_encoder_layers,
+                "batch_norm_eps": self.batch_norm_eps,
+                "hidden_expansion": self.hidden_expansion,
+                "depth_multiplier": self.depth_multiplier,
+                "kernel_initializer": self.kernel_initializer,
+                "bias_initializer": self.bias_initializer,
+                "channel_axis": self.channel_axis,
+                "data_format": self.data_format,
+            }
+        )
+        return config
+
+    def compute_output_spec(
+        self,
+        inputs_embeds,
+        attention_mask_spec=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        training=None,
+    ):
+        output_attentions = (
+            output_attentions if output_attentions is not None else False
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else False
+        )
+        hidden_states_specs = list(inputs_embeds)
+        encoder_states_tuple_specs = () if output_hidden_states else None
+        all_attentions_tuple_specs = () if output_attentions else None
+        processed_maps_specs = {}
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                current_feature_map_spec = hidden_states_specs[enc_ind]
+                if output_hidden_states:
+                    encoder_states_tuple_specs += (
+                        self.identity(current_feature_map_spec),
+                    )
+                if self.data_format == "channels_last":
+                    batch_size, h, w, c = current_feature_map_spec.shape
+                else:
+                    batch_size, c, h, w = current_feature_map_spec.shape
+                seq_len = h * w if h is not None and w is not None else None
+                src_flatten_spec = keras.KerasTensor(
+                    (batch_size, seq_len, c), dtype=self.compute_dtype
+                )
+                pos_embed_spec = keras.KerasTensor(
+                    (batch_size, seq_len, self.encoder_hidden_dim),
+                    dtype=self.compute_dtype,
+                )
+                encoder_output_spec = self.encoder[i].compute_output_spec(
+                    src=src_flatten_spec,
+                    src_mask=attention_mask_spec,
+                    pos_embed=pos_embed_spec,
+                    output_attentions=output_attentions,
+                )
+                if output_attentions:
+                    _, layer_attentions_spec = encoder_output_spec
+                    all_attentions_tuple_specs += (layer_attentions_spec,)
+                if self.data_format == "channels_last":
+                    processed_maps_specs[enc_ind] = keras.KerasTensor(
+                        (batch_size, h, w, self.encoder_hidden_dim),
+                        dtype=self.compute_dtype,
+                    )
+                else:
+                    processed_maps_specs[enc_ind] = keras.KerasTensor(
+                        (batch_size, self.encoder_hidden_dim, h, w),
+                        dtype=self.compute_dtype,
+                    )
+        processed_hidden_states_specs = []
+        for i in range(len(hidden_states_specs)):
+            if i in processed_maps_specs:
+                processed_hidden_states_specs.append(processed_maps_specs[i])
+            else:
+                processed_hidden_states_specs.append(hidden_states_specs[i])
+        if self.num_encoder_layers > 0:
+            if output_hidden_states:
+                encoder_states_tuple_specs += (
+                    self.identity(
+                        processed_hidden_states_specs[
+                            self.encode_proj_layers[-1]
+                        ]
+                    ),
+                )
+        else:
+            processed_hidden_states_specs = hidden_states_specs
+        fpn_inter_outputs_specs = []
+        y_spec = processed_hidden_states_specs[-1]
+        for idx, (lateral_conv, fpn_block) in enumerate(
+            zip(self.lateral_convs, self.fpn_blocks)
+        ):
+            backbone_feature_map_k_spec = processed_hidden_states_specs[
+                self.num_fpn_stages - idx - 1
+            ]
+            y_lateral_spec = keras.KerasTensor(
+                lateral_conv.compute_output_shape(y_spec.shape),
+                dtype=self.compute_dtype,
+            )
+            fpn_inter_outputs_specs.append(y_lateral_spec)
+            y_upsampled_spec = keras.KerasTensor(
+                self.upsample.compute_output_shape(y_lateral_spec.shape),
+                dtype=self.compute_dtype,
+            )
+            concat_shape = list(y_upsampled_spec.shape)
+            concat_shape[self.channel_axis] += (
+                backbone_feature_map_k_spec.shape[self.channel_axis]
+            )
+            y_spec = keras.KerasTensor(
+                fpn_block.compute_output_shape(tuple(concat_shape)),
+                dtype=self.compute_dtype,
+            )
+        fpn_feature_maps_specs = fpn_inter_outputs_specs + [y_spec]
+        fpn_feature_maps_specs = fpn_feature_maps_specs[::-1]
+        pan_feature_maps_specs = [fpn_feature_maps_specs[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(
+            zip(self.downsample_convs, self.pan_blocks)
+        ):
+            top_pan_feature_map_k_spec = pan_feature_maps_specs[-1]
+            fpn_feature_map_k_spec = fpn_feature_maps_specs[idx + 1]
+            downsampled_feature_map_k_spec = keras.KerasTensor(
+                downsample_conv.compute_output_shape(
+                    top_pan_feature_map_k_spec.shape
+                ),
+                dtype=self.compute_dtype,
+            )
+            concat_shape = list(downsampled_feature_map_k_spec.shape)
+            concat_shape[self.channel_axis] += fpn_feature_map_k_spec.shape[
+                self.channel_axis
+            ]
+            new_pan_feature_map_k_spec = keras.KerasTensor(
+                pan_block.compute_output_shape(tuple(concat_shape)),
+                dtype=self.compute_dtype,
+            )
+            pan_feature_maps_specs.append(new_pan_feature_map_k_spec)
+        outputs = [
+            tuple(pan_feature_maps_specs),
+        ]
+        if output_hidden_states:
+            outputs.append(encoder_states_tuple_specs)
+        if output_attentions:
+            outputs.append(all_attentions_tuple_specs)
+        return tuple(outputs) if len(outputs) > 1 else outputs[0]
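
For orientation, a minimal sketch (not part of the release) of driving the new layer directly. Every hyperparameter value below is an illustrative assumption, and the input feature maps are assumed to already be projected to `encoder_hidden_dim` channels, as the FPN/PAN concatenation logic above expects:

import keras

from keras_hub.src.models.d_fine.d_fine_hybrid_encoder import (
    DFineHybridEncoder,
)

encoder = DFineHybridEncoder(
    encoder_in_channels=[512, 1024, 2048],  # hypothetical backbone channels
    feat_strides=[8, 16, 32],
    encoder_hidden_dim=256,
    encode_proj_layers=[2],  # transformer refinement on the coarsest level
    positional_encoding_temperature=10000.0,
    eval_size=None,
    normalize_before=False,
    num_attention_heads=8,
    dropout=0.0,
    layer_norm_eps=1e-5,
    encoder_activation_function="gelu",
    activation_dropout=0.0,
    encoder_ffn_dim=1024,
    num_encoder_layers=1,
    batch_norm_eps=1e-5,
    hidden_expansion=1.0,
    depth_multiplier=1.0,
    channel_axis=-1,
    data_format="channels_last",
)
# Three pyramid levels, already projected to 256 channels each; spatial
# sizes must halve level to level so the 2x upsampling lines up in the FPN.
features = [
    keras.random.normal((1, 80, 80, 256)),
    keras.random.normal((1, 40, 40, 256)),
    keras.random.normal((1, 20, 20, 256)),
]
outputs = encoder(features)
# outputs[0] is the list of PAN feature maps, one (1, H, W, 256) tensor per
# level, which the decoder consumes as keys and values.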
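
The `build_2d_sincos_position_embedding` helper splits `embedding_dim` into four groups of `embedding_dim // 4` frequencies, omega_k = 1 / temperature**(k / (embedding_dim // 4)), and concatenates sin(x * omega), cos(x * omega), sin(y * omega), cos(y * omega), which is why `embedding_dim` must be divisible by 4. A quick illustrative check that it produces one embedding per spatial position:

from keras_hub.src.models.d_fine.d_fine_hybrid_encoder import (
    DFineHybridEncoder,
)

# For a 4x3 grid with embedding_dim=8 there are 4 groups of 2 frequencies,
# so the output carries one 8-dim embedding for each of the 12 positions.
pos = DFineHybridEncoder.build_2d_sincos_position_embedding(
    width=4, height=3, embedding_dim=8, temperature=10000.0
)
print(pos.shape)  # (1, 12, 8)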
keras_hub/src/models/d_fine/d_fine_image_converter.py
@@ -0,0 +1,8 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
+from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+
+
+@keras_hub_export("keras_hub.layers.DFineImageConverter")
+class DFineImageConverter(ImageConverter):
+    backbone_cls = DFineBackbone
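
The converter subclass adds no logic of its own: it inherits resizing and rescaling from `ImageConverter` and registers `DFineBackbone` as its backbone class so presets resolve to it. A hedged sketch of direct use; the 640x640 size and 1/255 scale are illustrative assumptions, not values shipped with this release:

import numpy as np

import keras_hub

converter = keras_hub.layers.DFineImageConverter(
    image_size=(640, 640),  # assumed size, not a preset value
    scale=1.0 / 255.0,  # assumed rescale to [0, 1]
)
images = np.random.uniform(0, 255, size=(2, 480, 720, 3)).astype("float32")
outputs = converter(images)  # resized to (2, 640, 640, 3) and rescaled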