keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl

Files changed (40)
  1. keras_hub/layers/__init__.py +6 -0
  2. keras_hub/models/__init__.py +21 -0
  3. keras_hub/src/layers/modeling/position_embedding.py +21 -6
  4. keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
  5. keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
  6. keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
  7. keras_hub/src/models/backbone.py +10 -15
  8. keras_hub/src/models/d_fine/__init__.py +0 -0
  9. keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
  10. keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
  11. keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
  12. keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
  13. keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
  14. keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
  15. keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
  16. keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
  17. keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
  18. keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
  19. keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
  20. keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
  21. keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
  22. keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
  23. keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
  24. keras_hub/src/models/parseq/__init__.py +0 -0
  25. keras_hub/src/models/parseq/parseq_backbone.py +134 -0
  26. keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
  27. keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
  28. keras_hub/src/models/parseq/parseq_decoder.py +418 -0
  29. keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
  30. keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
  31. keras_hub/src/tests/test_case.py +37 -1
  32. keras_hub/src/utils/preset_utils.py +49 -0
  33. keras_hub/src/utils/tensor_utils.py +23 -1
  34. keras_hub/src/utils/transformers/convert_vit.py +4 -1
  35. keras_hub/src/version.py +1 -1
  36. keras_hub/tokenizers/__init__.py +3 -0
  37. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
  38. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
  39. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
  40. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,891 @@
+ import math
+
+ import keras
+ import numpy as np
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.backbone import Backbone
+ from keras_hub.src.models.d_fine.d_fine_decoder import DFineDecoder
+ from keras_hub.src.models.d_fine.d_fine_hybrid_encoder import DFineHybridEncoder
+ from keras_hub.src.models.d_fine.d_fine_layers import DFineAnchorGenerator
+ from keras_hub.src.models.d_fine.d_fine_layers import (
+     DFineContrastiveDenoisingGroupGenerator,
+ )
+ from keras_hub.src.models.d_fine.d_fine_layers import (
+     DFineInitialQueryAndReferenceGenerator,
+ )
+ from keras_hub.src.models.d_fine.d_fine_layers import DFineMLPPredictionHead
+ from keras_hub.src.models.d_fine.d_fine_layers import DFineSourceFlattener
+ from keras_hub.src.models.d_fine.d_fine_layers import (
+     DFineSpatialShapesExtractor,
+ )
+ from keras_hub.src.models.d_fine.d_fine_utils import d_fine_kernel_initializer
+ from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+ class DFineDenoisingPreprocessorLayer(keras.layers.Layer):
+     """Processes and prepares tensors for contrastive denoising.
+
+     This layer is a helper used within the `DFineBackbone`'s functional
+     model definition. Its primary role is to take the outputs from the
+     `DFineContrastiveDenoisingGroupGenerator` and prepare them for the
+     dynamic, per-batch forward pass, since this logic cannot be embedded
+     directly in `DFineBackbone`'s symbolic forward pass.
+
+     The layer takes a tuple of `(pixel_values, input_query_class,
+     denoising_bbox_unact, attention_mask)` and an optional
+     `denoising_meta_values` dictionary as input to its `call` method.
+     """
+
+     def __init__(self, dtype=None, **kwargs):
+         super().__init__(dtype=dtype, **kwargs)
+
+     def call(self, inputs, denoising_meta_values=None):
+         (
+             pixel_values,
+             input_query_class,
+             denoising_bbox_unact,
+             attention_mask,
+         ) = inputs
+         input_query_class_tensor = keras.ops.convert_to_tensor(
+             input_query_class, dtype="int32"
+         )
+         denoising_bbox_unact_tensor = keras.ops.convert_to_tensor(
+             denoising_bbox_unact, dtype=self.compute_dtype
+         )
+         attention_mask_tensor = keras.ops.convert_to_tensor(
+             attention_mask, dtype=self.compute_dtype
+         )
+         outputs = {
+             "input_query_class": input_query_class_tensor,
+             "denoising_bbox_unact": denoising_bbox_unact_tensor,
+             "attention_mask": attention_mask_tensor,
+         }
+
+         if denoising_meta_values is not None:
+             batch_size = keras.ops.shape(pixel_values)[0]
+             dn_positive_idx = denoising_meta_values["dn_positive_idx"]
+             c_batch_size = keras.ops.shape(dn_positive_idx)[0]
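+             # The denoising metadata was computed from `labels` for a fixed
+             # batch size at graph-construction time, so broadcast it to the
+             # runtime batch via ceil-division tiling followed by a slice.
+             # For example, batch_size=5 with c_batch_size=2 gives
+             # num_repeats = (5 + 2 - 1) // 2 = 3, a tiled length of 6, and
+             # a final slice of 5 rows.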
+             if c_batch_size == 0:
+                 outputs["dn_positive_idx"] = keras.ops.zeros(
+                     (batch_size,) + keras.ops.shape(dn_positive_idx)[1:],
+                     dtype=dn_positive_idx.dtype,
+                 )
+             else:
+                 num_repeats = (batch_size + c_batch_size - 1) // c_batch_size
+                 dn_positive_idx_tiled = keras.ops.tile(
+                     dn_positive_idx,
+                     (num_repeats,)
+                     + (1,) * (keras.ops.ndim(dn_positive_idx) - 1),
+                 )
+                 outputs["dn_positive_idx"] = dn_positive_idx_tiled[:batch_size]
+             dn_num_group = denoising_meta_values["dn_num_group"]
+             outputs["dn_num_group"] = keras.ops.tile(
+                 keras.ops.expand_dims(dn_num_group, 0), (batch_size,)
+             )
+             dn_num_split = denoising_meta_values["dn_num_split"]
+             outputs["dn_num_split"] = keras.ops.tile(
+                 keras.ops.expand_dims(dn_num_split, 0), (batch_size, 1)
+             )
+
+         return outputs
+
+
+ @keras_hub_export("keras_hub.models.DFineBackbone")
+ class DFineBackbone(Backbone):
+     """D-FINE Backbone for Object Detection.
+
+     This class implements the core D-FINE architecture, which serves as the
+     backbone for `DFineObjectDetector`. It integrates an `HGNetV2Backbone`
+     for initial feature extraction, a `DFineHybridEncoder` for multi-scale
+     feature fusion using FPN/PAN pathways, and a `DFineDecoder` for refining
+     object queries.
+
+     The backbone orchestrates the entire forward pass, from processing raw
+     pixels to generating intermediate predictions. Key steps include:
+     1. Extracting multi-scale feature maps using the HGNetV2 backbone.
+     2. Fusing these features with the hybrid encoder.
+     3. Generating anchor proposals and selecting the top-k to initialize
+        decoder queries and reference points.
+     4. Generating noisy queries for contrastive denoising (if the `labels`
+        argument is provided).
+     5. Passing the queries and fused features through the transformer
+        decoder to produce iterative predictions for bounding boxes and
+        class logits.
+
+     Args:
+         backbone: A `keras.Model` instance that serves as the feature
+             extractor. While any `keras.Model` can be used, we highly
+             recommend using a `keras_hub.models.HGNetV2Backbone` instance,
+             as this architecture is optimized for its outputs. If a custom
+             backbone is provided, it must have a `stage_names` attribute,
+             or the `out_features` argument for this model must be
+             specified. This requirement helps prevent hard-to-debug
+             downstream dimensionality errors.
+         decoder_in_channels: list, Channel dimensions of the multi-scale
+             features from the hybrid encoder. This should typically be a
+             list of `encoder_hidden_dim` repeated for each feature level.
+         encoder_hidden_dim: int, Hidden dimension size for the encoder
+             layers.
+         num_labels: int, Number of object classes for detection.
+         num_denoising: int, Number of denoising queries for contrastive
+             denoising training. Set to `0` to disable denoising.
+         learn_initial_query: bool, Whether to learn initial query
+             embeddings.
+         num_queries: int, Number of object queries for detection.
+         anchor_image_size: tuple, Size of the anchor image as
+             `(height, width)`.
+         feat_strides: list, List of feature stride values for different
+             pyramid levels.
+         num_feature_levels: int, Number of feature pyramid levels to use.
+         hidden_dim: int, Hidden dimension size for the model.
+         encoder_in_channels: list, Channel dimensions of the feature maps
+             from the backbone (`HGNetV2Backbone`) that are fed into the
+             hybrid encoder.
+         encode_proj_layers: list, List specifying projection layer
+             configurations.
+         num_attention_heads: int, Number of attention heads in encoder
+             layers.
+         encoder_ffn_dim: int, Feed-forward network dimension in encoder.
+         num_encoder_layers: int, Number of encoder layers.
+         hidden_expansion: float, Hidden dimension expansion factor.
+         depth_multiplier: float, Depth multiplier for the backbone.
+         eval_idx: int, Index for evaluation. Defaults to `-1` for the last
+             layer.
+         num_decoder_layers: int, Number of decoder layers.
+         decoder_attention_heads: int, Number of attention heads in decoder
+             layers.
+         decoder_ffn_dim: int, Feed-forward network dimension in decoder.
+         decoder_method: str, Decoder method. Can be either `"default"` or
+             `"discrete"`. Defaults to `"default"`.
+         decoder_n_points: list, Number of sampling points for deformable
+             attention.
+         lqe_hidden_dim: int, Hidden dimension for learned query embedding.
+         num_lqe_layers: int, Number of layers in learned query embedding.
+         label_noise_ratio: float, Ratio of label noise for denoising
+             training. Defaults to `0.5`.
+         box_noise_scale: float, Scale factor for box noise in denoising
+             training. Defaults to `1.0`.
+         labels: list or None, Ground truth labels for denoising training.
+             This is passed during model initialization to construct the
+             training graph for contrastive denoising. Each element should
+             be a dictionary with `"boxes"` (numpy array of shape `[N, 4]`
+             with normalized coordinates) and `"labels"` (numpy array of
+             shape `[N]` with class indices). Required when
+             `num_denoising > 0`. Defaults to `None`.
+         seed: int or None, Random seed for reproducibility. Defaults to
+             `None`.
+         image_shape: tuple, Shape of input images as `(height, width,
+             channels)`. Height and width can be `None` for variable input
+             sizes. Defaults to `(None, None, 3)`.
+         out_features: list or None, List of feature names to output from
+             the backbone. If `None`, uses the last
+             `len(decoder_in_channels)` features from the backbone's
+             `stage_names`. Defaults to `None`.
+         data_format: str, The data format of the image channels. Can be
+             either `"channels_first"` or `"channels_last"`. If `None` is
+             specified, it will use the `image_data_format` value found in
+             your Keras config file at `~/.keras/keras.json`. Defaults to
+             `None`.
+         dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The
+             dtype to use for the model's computations and weights.
+             Defaults to `None`.
+         **kwargs: Additional keyword arguments passed to the base class.
+
+     Example:
+     ```python
+     import keras
+     import numpy as np
+     from keras_hub.models import DFineBackbone
+     from keras_hub.models import HGNetV2Backbone
+
+     # Example 1: Basic usage without denoising.
+     # First, build the `HGNetV2Backbone` instance.
+     hgnetv2 = HGNetV2Backbone(
+         stem_channels=[3, 16, 16],
+         stackwise_stage_filters=[
+             [16, 16, 64, 1, 3, 3],
+             [64, 32, 256, 1, 3, 3],
+             [256, 64, 512, 2, 3, 5],
+             [512, 128, 1024, 1, 3, 5],
+         ],
+         apply_downsample=[False, True, True, True],
+         use_lightweight_conv_block=[False, False, True, True],
+         depths=[1, 1, 2, 1],
+         hidden_sizes=[64, 256, 512, 1024],
+         embedding_size=16,
+         use_learnable_affine_block=True,
+         hidden_act="relu",
+         image_shape=(None, None, 3),
+         out_features=["stage3", "stage4"],
+         data_format="channels_last",
+     )
+
+     # Then, pass the backbone instance to `DFineBackbone`.
+     backbone = DFineBackbone(
+         backbone=hgnetv2,
+         decoder_in_channels=[128, 128],
+         encoder_hidden_dim=128,
+         num_denoising=0,  # Disable denoising
+         num_labels=80,
+         hidden_dim=128,
+         learn_initial_query=False,
+         num_queries=300,
+         anchor_image_size=(256, 256),
+         feat_strides=[16, 32],
+         num_feature_levels=2,
+         encoder_in_channels=[512, 1024],
+         encode_proj_layers=[1],
+         num_attention_heads=8,
+         encoder_ffn_dim=512,
+         num_encoder_layers=1,
+         hidden_expansion=0.34,
+         depth_multiplier=0.5,
+         eval_idx=-1,
+         num_decoder_layers=3,
+         decoder_attention_heads=8,
+         decoder_ffn_dim=512,
+         decoder_n_points=[6, 6],
+         lqe_hidden_dim=64,
+         num_lqe_layers=2,
+         out_features=["stage3", "stage4"],
+         image_shape=(None, None, 3),
+         data_format="channels_last",
+         seed=0,
+     )
+
+     # Prepare input data.
+     input_data = keras.random.uniform((2, 256, 256, 3))
+
+     # Forward pass.
+     outputs = backbone(input_data)
+
+     # Example 2: With contrastive denoising training.
+     labels = [
+         {
+             "boxes": np.array([[0.5, 0.5, 0.2, 0.2], [0.4, 0.4, 0.1, 0.1]]),
+             "labels": np.array([1, 10]),
+         },
+         {
+             "boxes": np.array([[0.6, 0.6, 0.3, 0.3]]),
+             "labels": np.array([20]),
+         },
+     ]
+
+     # Pass the `HGNetV2Backbone` instance to `DFineBackbone`.
+     backbone_with_denoising = DFineBackbone(
+         backbone=hgnetv2,
+         decoder_in_channels=[128, 128],
+         encoder_hidden_dim=128,
+         num_denoising=100,  # Enable denoising
+         num_labels=80,
+         hidden_dim=128,
+         learn_initial_query=False,
+         num_queries=300,
+         anchor_image_size=(256, 256),
+         feat_strides=[16, 32],
+         num_feature_levels=2,
+         encoder_in_channels=[512, 1024],
+         encode_proj_layers=[1],
+         num_attention_heads=8,
+         encoder_ffn_dim=512,
+         num_encoder_layers=1,
+         hidden_expansion=0.34,
+         depth_multiplier=0.5,
+         eval_idx=-1,
+         num_decoder_layers=3,
+         decoder_attention_heads=8,
+         decoder_ffn_dim=512,
+         decoder_n_points=[6, 6],
+         lqe_hidden_dim=64,
+         num_lqe_layers=2,
+         out_features=["stage3", "stage4"],
+         image_shape=(None, None, 3),
+         seed=0,
+         labels=labels,
+     )
+
+     # Forward pass with denoising.
+     outputs_with_denoising = backbone_with_denoising(input_data)
+     ```
+     """
+
+     def __init__(
+         self,
+         backbone,
+         decoder_in_channels,
+         encoder_hidden_dim,
+         num_labels,
+         num_denoising,
+         learn_initial_query,
+         num_queries,
+         anchor_image_size,
+         feat_strides,
+         num_feature_levels,
+         hidden_dim,
+         encoder_in_channels,
+         encode_proj_layers,
+         num_attention_heads,
+         encoder_ffn_dim,
+         num_encoder_layers,
+         hidden_expansion,
+         depth_multiplier,
+         eval_idx,
+         num_decoder_layers,
+         decoder_attention_heads,
+         decoder_ffn_dim,
+         decoder_n_points,
+         lqe_hidden_dim,
+         num_lqe_layers,
+         decoder_method="default",
+         label_noise_ratio=0.5,
+         box_noise_scale=1.0,
+         labels=None,
+         seed=None,
+         image_shape=(None, None, 3),
+         out_features=None,
+         data_format=None,
+         dtype=None,
+         **kwargs,
+     ):
+         if decoder_method not in ["default", "discrete"]:
+             decoder_method = "default"
+         data_format = standardize_data_format(data_format)
+         channel_axis = -1 if data_format == "channels_last" else 1
+         self.backbone = backbone
+         # Re-instantiate the backbone if its `data_format` mismatches the
+         # parent's.
+         if (
+             hasattr(self.backbone, "data_format")
+             and self.backbone.data_format != data_format
+         ):
+             backbone_config = self.backbone.get_config()
+             backbone_config["data_format"] = data_format
+             if (
+                 "image_shape" in backbone_config
+                 and backbone_config["image_shape"] is not None
+                 and len(backbone_config["image_shape"]) == 3
+             ):
+                 backbone_config["image_shape"] = tuple(
+                     reversed(backbone_config["image_shape"])
+                 )
+             self.backbone = self.backbone.__class__.from_config(backbone_config)
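+         # Precompute the anchor grid shape for each pyramid level. For
+         # example, anchor_image_size=(256, 256) with feat_strides=[16, 32]
+         # yields spatial_shapes=[(16, 16), (8, 8)].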
+         spatial_shapes = []
+         for s in feat_strides:
+             h = anchor_image_size[0] // s
+             w = anchor_image_size[1] // s
+             spatial_shapes.append((h, w))
+         # NOTE: While `HGNetV2Backbone` is handled automatically,
+         # `out_features` must be specified for custom backbones. This
+         # design choice prevents hard-to-debug dimension mismatches by
+         # placing the onus on the user to ensure compatibility.
+         if not hasattr(self.backbone, "stage_names") and out_features is None:
+             raise ValueError(
+                 "`out_features` must be specified when using a custom "
+                 "backbone that does not have a `stage_names` attribute."
+             )
+         stage_names = getattr(self.backbone, "stage_names", out_features)
+         out_features = (
+             out_features
+             if out_features is not None
+             else stage_names[-len(decoder_in_channels) :]
+         )
+         initializer = d_fine_kernel_initializer(
+             initializer_range=0.01,
+         )
+
+         # === Layers ===
+         self.encoder = DFineHybridEncoder(
+             encoder_in_channels=encoder_in_channels,
+             feat_strides=feat_strides,
+             encoder_hidden_dim=encoder_hidden_dim,
+             encode_proj_layers=encode_proj_layers,
+             positional_encoding_temperature=10000,
+             eval_size=None,
+             normalize_before=False,
+             num_attention_heads=num_attention_heads,
+             dropout=0.0,
+             layer_norm_eps=1e-5,
+             encoder_activation_function="gelu",
+             activation_dropout=0.0,
+             encoder_ffn_dim=encoder_ffn_dim,
+             num_encoder_layers=num_encoder_layers,
+             batch_norm_eps=1e-5,
+             hidden_expansion=hidden_expansion,
+             depth_multiplier=depth_multiplier,
+             kernel_initializer=initializer,
+             bias_initializer="zeros",
+             channel_axis=channel_axis,
+             data_format=data_format,
+             dtype=dtype,
+             name="hybrid_encoder",
+         )
+         self.decoder = DFineDecoder(
+             layer_scale=1.0,
+             eval_idx=eval_idx,
+             num_decoder_layers=num_decoder_layers,
+             dropout=0.0,
+             hidden_dim=hidden_dim,
+             reg_scale=4.0,
+             max_num_bins=32,
+             upsampling_factor=0.5,
+             decoder_attention_heads=decoder_attention_heads,
+             attention_dropout=0.0,
+             decoder_activation_function="relu",
+             activation_dropout=0.0,
+             layer_norm_eps=1e-5,
+             decoder_ffn_dim=decoder_ffn_dim,
+             num_feature_levels=num_feature_levels,
+             decoder_offset_scale=0.5,
+             decoder_method=decoder_method,
+             decoder_n_points=decoder_n_points,
+             top_prob_values=4,
+             lqe_hidden_dim=lqe_hidden_dim,
+             num_lqe_layers=num_lqe_layers,
+             num_labels=num_labels,
+             spatial_shapes=spatial_shapes,
+             dtype=dtype,
+             initializer_bias_prior_prob=None,
+             num_queries=num_queries,
+             name="decoder",
+         )
+         self.anchor_generator = DFineAnchorGenerator(
+             anchor_image_size=anchor_image_size,
+             feat_strides=feat_strides,
+             data_format=data_format,
+             dtype=dtype,
+             name="anchor_generator",
+         )
+         self.contrastive_denoising_group_generator = (
+             DFineContrastiveDenoisingGroupGenerator(
+                 num_labels=num_labels,
+                 num_denoising=num_denoising,
+                 label_noise_ratio=label_noise_ratio,
+                 box_noise_scale=box_noise_scale,
+                 seed=seed,
+                 dtype=dtype,
+                 name="contrastive_denoising_group_generator",
+             )
+         )
+         if num_denoising > 0:
+             self.denoising_class_embed = keras.layers.Embedding(
+                 input_dim=num_labels + 1,
+                 output_dim=hidden_dim,
+                 embeddings_initializer="glorot_uniform",
+                 name="denoising_class_embed",
+                 dtype=dtype,
+             )
+             self.denoising_class_embed.build(None)
+         else:
+             self.denoising_class_embed = None
+
+         self.source_flattener = DFineSourceFlattener(
+             dtype=dtype,
+             name="source_flattener",
+             channel_axis=channel_axis,
+             data_format=data_format,
+         )
+         self.initial_query_reference_generator = (
+             DFineInitialQueryAndReferenceGenerator(
+                 num_queries=num_queries,
+                 learn_initial_query=learn_initial_query,
+                 hidden_dim=hidden_dim,
+                 dtype=dtype,
+                 name="initial_query_reference_generator",
+             )
+         )
+         self.spatial_shapes_extractor = DFineSpatialShapesExtractor(
+             dtype=dtype,
+             data_format=data_format,
+             name="spatial_shapes_extractor",
+         )
+         num_backbone_outs = len(decoder_in_channels)
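+         # Project each selected backbone feature map to
+         # `encoder_hidden_dim` channels with a 1x1 convolution followed by
+         # batch normalization, one pair per feature level.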
+         self.encoder_input_proj_layers = []
+         for i in range(num_backbone_outs):
+             self.encoder_input_proj_layers.append(
+                 [
+                     keras.layers.Conv2D(
+                         filters=encoder_hidden_dim,
+                         kernel_size=1,
+                         use_bias=False,
+                         kernel_initializer=initializer,
+                         bias_initializer="zeros",
+                         data_format=data_format,
+                         name=f"encoder_input_proj_conv_{i}",
+                         dtype=dtype,
+                     ),
+                     keras.layers.BatchNormalization(
+                         epsilon=1e-5,
+                         axis=channel_axis,
+                         name=f"encoder_input_proj_bn_{i}",
+                         dtype=dtype,
+                     ),
+                 ]
+             )
+         self.enc_output_layers = [
+             keras.layers.Dense(
+                 hidden_dim,
+                 name="enc_output_dense",
+                 dtype=dtype,
+             ),
+             keras.layers.LayerNormalization(
+                 epsilon=1e-5,
+                 name="enc_output_ln",
+                 dtype=dtype,
+             ),
+         ]
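+         # Bias the classification head toward a low foreground prior, the
+         # usual focal-style initialization. With num_labels=80,
+         # prior_prob = 1 / 81 and the bias is -log((1 - p) / p) ≈ -4.38,
+         # so initial class scores start near the background prior.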
+         prior_prob = 1 / (num_labels + 1)
+         enc_score_head_bias = float(-math.log((1 - prior_prob) / prior_prob))
+         self.enc_score_head = keras.layers.Dense(
+             num_labels,
+             name="enc_score_head",
+             dtype=dtype,
+             kernel_initializer="glorot_uniform",
+             bias_initializer=keras.initializers.Constant(enc_score_head_bias),
+         )
+         self.enc_bbox_head = DFineMLPPredictionHead(
+             input_dim=hidden_dim,
+             hidden_dim=hidden_dim,
+             output_dim=4,
+             num_layers=3,
+             name="enc_bbox_head",
+             dtype=dtype,
+             kernel_initializer=initializer,
+             last_layer_initializer="zeros",
+         )
+         self.decoder_input_proj_layers = []
+         for i in range(num_backbone_outs):
+             if hidden_dim == decoder_in_channels[-1]:
+                 proj_layer = keras.layers.Identity(
+                     name=f"decoder_input_proj_identity_{i}",
+                     dtype=dtype,
+                 )
+                 self.decoder_input_proj_layers.append(proj_layer)
+             else:
+                 self.decoder_input_proj_layers.append(
+                     [
+                         keras.layers.Conv2D(
+                             filters=hidden_dim,
+                             kernel_size=1,
+                             use_bias=False,
+                             kernel_initializer=initializer,
+                             bias_initializer="zeros",
+                             data_format=data_format,
+                             name=f"decoder_input_proj_conv1_{i}",
+                             dtype=dtype,
+                         ),
+                         keras.layers.BatchNormalization(
+                             epsilon=1e-5,
+                             axis=channel_axis,
+                             name=f"decoder_input_proj_bn1_{i}",
+                             dtype=dtype,
+                         ),
+                     ]
+                 )
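+         # When `num_feature_levels` exceeds the number of backbone
+         # outputs, synthesize the extra levels below with stride-2 3x3
+         # conv + batch norm projections applied to the coarsest source.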
+         for i in range(num_feature_levels - num_backbone_outs):
+             idx = num_backbone_outs + i
+             if hidden_dim == decoder_in_channels[-1]:
+                 proj_layer = keras.layers.Identity(
+                     name=f"decoder_input_proj_identity_{idx}",
+                     dtype=dtype,
+                 )
+                 self.decoder_input_proj_layers.append(proj_layer)
+             else:
+                 self.decoder_input_proj_layers.append(
+                     [
+                         keras.layers.Conv2D(
+                             filters=hidden_dim,
+                             kernel_size=3,
+                             strides=2,
+                             padding="same",
+                             use_bias=False,
+                             kernel_initializer=initializer,
+                             bias_initializer="zeros",
+                             data_format=data_format,
+                             name=f"decoder_input_proj_conv3_{idx}",
+                             dtype=dtype,
+                         ),
+                         keras.layers.BatchNormalization(
+                             epsilon=1e-5,
+                             axis=channel_axis,
+                             name=f"decoder_input_proj_bn3_{idx}",
+                             dtype=dtype,
+                         ),
+                     ]
+                 )
+         self.dn_split_point = None
+
+         # === Functional Model ===
+         pixel_values = keras.Input(
+             shape=image_shape, name="pixel_values", dtype="float32"
+         )
+         feature_maps_output = self.backbone(pixel_values)
+         feature_maps = [feature_maps_output[stage] for stage in out_features]
+         feature_maps_output_tuple = tuple(feature_maps)
+         proj_feats = []
+         for level, feature_map in enumerate(feature_maps_output_tuple):
+             x = self.encoder_input_proj_layers[level][0](feature_map)
+             x = self.encoder_input_proj_layers[level][1](x)
+             proj_feats.append(x)
+         encoder_outputs = self.encoder(
+             inputs_embeds=proj_feats,
+             output_hidden_states=True,
+             output_attentions=True,
+         )
+         encoder_last_hidden_state = encoder_outputs[0]
+         encoder_hidden_states = (
+             encoder_outputs[1] if len(encoder_outputs) > 1 else None
+         )
+         encoder_attentions = (
+             encoder_outputs[2] if len(encoder_outputs) > 2 else None
+         )
+         last_hidden_state = encoder_outputs[0]
+         sources = []
+         # NOTE: Handle both no-op (identity mapping) and an actual
+         # projection using Conv2D and BatchNorm with `isinstance(proj, list)`.
+         for level, source in enumerate(last_hidden_state):
+             proj = self.decoder_input_proj_layers[level]
+             if isinstance(proj, list):
+                 x = proj[0](source)
+                 x = proj[1](x)
+                 sources.append(x)
+             else:
+                 sources.append(proj(source))
+         if num_feature_levels > len(sources):
+             len_sources = len(sources)
+             proj = self.decoder_input_proj_layers[len_sources]
+             if isinstance(proj, list):
+                 x = proj[0](last_hidden_state[-1])
+                 x = proj[1](x)
+                 sources.append(x)
+             else:
+                 sources.append(proj(last_hidden_state[-1]))
+             for i in range(len_sources + 1, num_feature_levels):
+                 proj = self.decoder_input_proj_layers[i]
+                 if isinstance(proj, list):
+                     x = proj[0](sources[-1])
+                     x = proj[1](x)
+                     sources.append(x)
+                 else:
+                     sources.append(proj(sources[-1]))
+         spatial_shapes_tensor = self.spatial_shapes_extractor(sources)
+         source_flatten = self.source_flattener(sources)
+         if num_denoising > 0 and labels is not None:
+             (
+                 input_query_class,
+                 denoising_bbox_unact,
+                 attention_mask,
+                 denoising_meta_values,
+             ) = self.contrastive_denoising_group_generator(
+                 targets=labels,
+                 num_queries=num_queries,
+             )
+             self.dn_split_point = int(denoising_meta_values["dn_num_split"][0])
+         else:
+             (
+                 denoising_class,
+                 denoising_bbox_unact,
+                 attention_mask,
+                 denoising_meta_values,
+             ) = None, None, None, None
+
+         if num_denoising > 0 and labels is not None:
+             denoising_processor = DFineDenoisingPreprocessorLayer(
+                 name="denoising_processor", dtype=dtype
+             )
+             denoising_tensors = denoising_processor(
+                 [
+                     pixel_values,
+                     input_query_class,
+                     denoising_bbox_unact,
+                     attention_mask,
+                 ],
+                 denoising_meta_values=denoising_meta_values,
+             )
+             input_query_class_tensor = denoising_tensors["input_query_class"]
+             denoising_bbox_unact = denoising_tensors["denoising_bbox_unact"]
+             attention_mask = denoising_tensors["attention_mask"]
+             denoising_class = self.denoising_class_embed(
+                 input_query_class_tensor
+             )
+
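+         # Score every flattened encoder position against the anchor grid;
+         # the generator then selects the top-k proposals to initialize the
+         # decoder queries and reference points, taking the denoising
+         # tensors as additional inputs when they are present.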
+         anchors, valid_mask = self.anchor_generator(sources)
+         memory = keras.ops.where(valid_mask, source_flatten, 0.0)
+         output_memory = self.enc_output_layers[0](memory)
+         output_memory = self.enc_output_layers[1](output_memory)
+         enc_outputs_class = self.enc_score_head(output_memory)
+         enc_outputs_coord_logits = self.enc_bbox_head(output_memory)
+         enc_outputs_coord_logits_plus_anchors = (
+             enc_outputs_coord_logits + anchors
+         )
+         init_reference_points, target, enc_topk_logits, enc_topk_bboxes = (
+             self.initial_query_reference_generator(
+                 (
+                     enc_outputs_class,
+                     enc_outputs_coord_logits_plus_anchors,
+                     output_memory,
+                     sources[-1],
+                 ),
+                 denoising_bbox_unact=denoising_bbox_unact,
+                 denoising_class=denoising_class,
+             )
+         )
+         decoder_outputs = self.decoder(
+             inputs_embeds=target,
+             encoder_hidden_states=source_flatten,
+             reference_points=init_reference_points,
+             spatial_shapes=spatial_shapes_tensor,
+             attention_mask=attention_mask,
+             output_hidden_states=True,
+             output_attentions=True,
+         )
+         last_hidden_state = decoder_outputs[0]
+         intermediate_hidden_states = decoder_outputs[1]
+         intermediate_logits = decoder_outputs[2]
+         intermediate_reference_points = decoder_outputs[3]
+         intermediate_predicted_corners = decoder_outputs[4]
+         initial_reference_points = decoder_outputs[5]
+         decoder_hidden_states = (
+             decoder_outputs[6] if len(decoder_outputs) > 6 else None
+         )
+         decoder_attentions = (
+             decoder_outputs[7] if len(decoder_outputs) > 7 else None
+         )
+         cross_attentions = (
+             decoder_outputs[8] if len(decoder_outputs) > 8 else None
+         )
+         outputs = {
+             "last_hidden_state": last_hidden_state,
+             "intermediate_hidden_states": intermediate_hidden_states,
+             "intermediate_logits": intermediate_logits,
+             "intermediate_reference_points": intermediate_reference_points,
+             "intermediate_predicted_corners": intermediate_predicted_corners,
+             "initial_reference_points": initial_reference_points,
+             "decoder_hidden_states": decoder_hidden_states,
+             "decoder_attentions": decoder_attentions,
+             "cross_attentions": cross_attentions,
+             "encoder_last_hidden_state": encoder_last_hidden_state[0],
+             "encoder_hidden_states": encoder_hidden_states,
+             "encoder_attentions": encoder_attentions,
+             "init_reference_points": init_reference_points,
+             "enc_topk_logits": enc_topk_logits,
+             "enc_topk_bboxes": enc_topk_bboxes,
+             "enc_outputs_class": enc_outputs_class,
+             "enc_outputs_coord_logits": enc_outputs_coord_logits,
+         }
+
+         if num_denoising > 0 and labels is not None:
+             outputs["dn_positive_idx"] = denoising_tensors["dn_positive_idx"]
+             outputs["dn_num_group"] = denoising_tensors["dn_num_group"]
+             outputs["dn_num_split"] = denoising_tensors["dn_num_split"]
+
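+         # Drop outputs that are `None` so every declared model output is a
+         # real tensor.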
+         outputs = {k: v for k, v in outputs.items() if v is not None}
+         super().__init__(
+             inputs=pixel_values,
+             outputs=outputs,
+             dtype=dtype,
+             **kwargs,
+         )
+
+         # === Config ===
+         self.decoder_in_channels = decoder_in_channels
+         self.encoder_hidden_dim = encoder_hidden_dim
+         self.num_labels = num_labels
+         self.num_denoising = num_denoising
+         self.learn_initial_query = learn_initial_query
+         self.num_queries = num_queries
+         self.anchor_image_size = anchor_image_size
+         self.feat_strides = feat_strides
+         self.num_feature_levels = num_feature_levels
+         self.hidden_dim = hidden_dim
+         self.encoder_in_channels = encoder_in_channels
+         self.encode_proj_layers = encode_proj_layers
+         self.num_attention_heads = num_attention_heads
+         self.encoder_ffn_dim = encoder_ffn_dim
+         self.num_encoder_layers = num_encoder_layers
+         self.hidden_expansion = hidden_expansion
+         self.depth_multiplier = depth_multiplier
+         self.eval_idx = eval_idx
+         self.box_noise_scale = box_noise_scale
+         self.labels = labels
+         self.label_noise_ratio = label_noise_ratio
+         self.num_decoder_layers = num_decoder_layers
+         self.decoder_attention_heads = decoder_attention_heads
+         self.decoder_ffn_dim = decoder_ffn_dim
+         self.decoder_method = decoder_method
+         self.decoder_n_points = decoder_n_points
+         self.lqe_hidden_dim = lqe_hidden_dim
+         self.num_lqe_layers = num_lqe_layers
+         self.data_format = data_format
+         self.seed = seed
+         self.image_shape = image_shape
+         self.channel_axis = channel_axis
+         self.spatial_shapes = spatial_shapes
+         self.stage_names = stage_names
+         self.out_features = out_features
+         self.initializer = initializer
+
+     def get_config(self):
+         config = super().get_config()
+         serializable_labels = None
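+         # Convert any numpy arrays in `labels` to plain lists so the
+         # config stays JSON-serializable.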
+         if self.labels is not None:
+             serializable_labels = []
+             for target in self.labels:
+                 serializable_target = {}
+                 for key, value in target.items():
+                     if hasattr(value, "tolist"):
+                         serializable_target[key] = value.tolist()
+                     else:
+                         serializable_target[key] = value
+                 serializable_labels.append(serializable_target)
+         config.update(
+             {
+                 "backbone": keras.layers.serialize(self.backbone),
+                 "decoder_in_channels": self.decoder_in_channels,
+                 "encoder_hidden_dim": self.encoder_hidden_dim,
+                 "num_labels": self.num_labels,
+                 "num_denoising": self.num_denoising,
+                 "learn_initial_query": self.learn_initial_query,
+                 "num_queries": self.num_queries,
+                 "anchor_image_size": self.anchor_image_size,
+                 "feat_strides": self.feat_strides,
+                 "num_feature_levels": self.num_feature_levels,
+                 "hidden_dim": self.hidden_dim,
+                 "encoder_in_channels": self.encoder_in_channels,
+                 "encode_proj_layers": self.encode_proj_layers,
+                 "num_attention_heads": self.num_attention_heads,
+                 "encoder_ffn_dim": self.encoder_ffn_dim,
+                 "num_encoder_layers": self.num_encoder_layers,
+                 "hidden_expansion": self.hidden_expansion,
+                 "depth_multiplier": self.depth_multiplier,
+                 "eval_idx": self.eval_idx,
+                 "box_noise_scale": self.box_noise_scale,
+                 "label_noise_ratio": self.label_noise_ratio,
+                 "labels": serializable_labels,
+                 "num_decoder_layers": self.num_decoder_layers,
+                 "decoder_attention_heads": self.decoder_attention_heads,
+                 "decoder_ffn_dim": self.decoder_ffn_dim,
+                 "decoder_method": self.decoder_method,
+                 "decoder_n_points": self.decoder_n_points,
+                 "lqe_hidden_dim": self.lqe_hidden_dim,
+                 "num_lqe_layers": self.num_lqe_layers,
+                 "seed": self.seed,
+                 "image_shape": self.image_shape,
+                 "data_format": self.data_format,
+                 "out_features": self.out_features,
+             }
+         )
+         return config
+
+     @classmethod
+     def from_config(cls, config, custom_objects=None):
+         config = config.copy()
+         if "labels" in config and config["labels"] is not None:
+             labels = config["labels"]
+             deserialized_labels = []
+             for target in labels:
+                 deserialized_target = {}
+                 for key, value in target.items():
+                     if isinstance(value, list):
+                         deserialized_target[key] = np.array(value)
+                     else:
+                         deserialized_target[key] = value
+                 deserialized_labels.append(deserialized_target)
+             config["labels"] = deserialized_labels
+         if "dtype" in config and config["dtype"] is not None:
+             dtype_config = config["dtype"]
+             if "dtype" not in config["backbone"]["config"]:
+                 config["backbone"]["config"]["dtype"] = dtype_config
+         config["backbone"] = keras.layers.deserialize(
+             config["backbone"], custom_objects=custom_objects
+         )
+         return cls(**config)