keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. keras_hub/layers/__init__.py +6 -0
  2. keras_hub/models/__init__.py +21 -0
  3. keras_hub/src/layers/modeling/position_embedding.py +21 -6
  4. keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
  5. keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
  6. keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
  7. keras_hub/src/models/backbone.py +10 -15
  8. keras_hub/src/models/d_fine/__init__.py +0 -0
  9. keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
  10. keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
  11. keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
  12. keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
  13. keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
  14. keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
  15. keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
  16. keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
  17. keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
  18. keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
  19. keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
  20. keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
  21. keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
  22. keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
  23. keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
  24. keras_hub/src/models/parseq/__init__.py +0 -0
  25. keras_hub/src/models/parseq/parseq_backbone.py +134 -0
  26. keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
  27. keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
  28. keras_hub/src/models/parseq/parseq_decoder.py +418 -0
  29. keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
  30. keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
  31. keras_hub/src/tests/test_case.py +37 -1
  32. keras_hub/src/utils/preset_utils.py +49 -0
  33. keras_hub/src/utils/tensor_utils.py +23 -1
  34. keras_hub/src/utils/transformers/convert_vit.py +4 -1
  35. keras_hub/src/version.py +1 -1
  36. keras_hub/tokenizers/__init__.py +3 -0
  37. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
  38. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
  39. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
  40. {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
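The headline additions in this nightly are two new model families: D-FINE (object detection) and PARSeq (scene-text recognition). As a quick orientation before the diff below, this sketch shows where the new detector surfaces in the public API; the `keras_hub.models.DFineObjectDetector` path is taken from the `keras_hub_export` decorator visible in the diff, while preset names are deliberately omitted since `d_fine_presets.py` is not shown here.

```python
# Minimal sketch, assuming a nightly build that includes this diff. The
# export path `keras_hub.models.DFineObjectDetector` comes from the
# `keras_hub_export` decorator in the file shown below; no presets assumed.
import keras_hub

# The class-level backbone binding is part of the new file as well.
print(keras_hub.models.DFineObjectDetector.backbone_cls)  # DFineBackbone
```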
keras_hub/src/models/d_fine/d_fine_object_detector.py ADDED
@@ -0,0 +1,875 @@
+ import keras
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.layers.modeling.non_max_supression import NonMaxSuppression
+ from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+ from keras_hub.src.models.d_fine.d_fine_loss import compute_box_losses
+ from keras_hub.src.models.d_fine.d_fine_loss import compute_local_losses
+ from keras_hub.src.models.d_fine.d_fine_loss import compute_vfl_loss
+ from keras_hub.src.models.d_fine.d_fine_loss import get_cdn_matched_indices
+ from keras_hub.src.models.d_fine.d_fine_loss import hungarian_matcher
+ from keras_hub.src.models.d_fine.d_fine_object_detector_preprocessor import (
+     DFineObjectDetectorPreprocessor,
+ )
+ from keras_hub.src.models.object_detector import ObjectDetector
+ from keras_hub.src.utils.tensor_utils import assert_bounding_box_support
+
+
+ @keras_hub_export("keras_hub.models.DFineObjectDetector")
+ class DFineObjectDetector(ObjectDetector):
+     """D-FINE Object Detector model.
+
+     This class wraps the `DFineBackbone` and adds the final prediction and loss
+     computation logic for end-to-end object detection. It is responsible for:
+     1. Defining the functional model that connects the `DFineBackbone` to the
+        input layers.
+     2. Implementing the `compute_loss` method, which uses a Hungarian matcher
+        to assign predictions to ground truth targets and calculates a weighted
+        sum of multiple loss components (classification, bounding box, etc.).
+     3. Post-processing the raw outputs from the backbone into final, decoded
+        predictions (boxes, labels, confidence scores) during inference.
+
+     Args:
+         backbone: A `keras_hub.models.Backbone` instance, specifically a
+             `DFineBackbone`, serving as the feature extractor for the object
+             detector.
+         num_classes: An integer representing the number of object classes to
+             detect.
+         bounding_box_format: A string specifying the format of the bounding
+             boxes. Defaults to `"yxyx"`. Must be a supported format (e.g.,
+             `"yxyx"`, `"xyxy"`).
+         preprocessor: Optional. An instance of
+             `DFineObjectDetectorPreprocessor` for input data preprocessing.
+         matcher_class_cost: A float representing the cost for class mismatch in
+             the Hungarian matcher. Defaults to `2.0`.
+         matcher_bbox_cost: A float representing the cost for bounding box
+             mismatch in the Hungarian matcher. Defaults to `5.0`.
+         matcher_ciou_cost: A float representing the cost for complete IoU
+             mismatch in the Hungarian matcher. Defaults to `2.0`.
+         use_focal_loss: A boolean indicating whether to use focal loss for
+             classification. Defaults to `True`.
+         matcher_alpha: A float parameter for the focal loss alpha. Defaults to
+             `0.25`.
+         matcher_gamma: A float parameter for the focal loss gamma. Defaults to
+             `2.0`.
+         weight_loss_vfl: Weight for the classification loss. Defaults to `1.0`.
+         weight_loss_bbox: Weight for the bounding box regression loss.
+             Defaults to `5.0`.
+         weight_loss_ciou: Weight for the complete IoU loss. Defaults to `2.0`.
+         weight_loss_fgl: Weight for the focal grid loss. Defaults to `0.15`.
+         weight_loss_ddf: Weight for the DDF loss. Defaults to `1.5`.
+         ddf_temperature: A float temperature scaling factor for the DDF loss.
+             Defaults to `5.0`.
+         prediction_decoder: Optional. A `keras.layers.Layer` instance that
+             decodes raw predictions. If not provided, a `NonMaxSuppression`
+             layer is used.
+         activation: Optional. The activation function to apply to the logits
+             before decoding. Defaults to `None`.
+
+     Examples:
+
+     **Creating a DFineObjectDetector without labels:**
+
+     ```python
+     import numpy as np
+     from keras_hub.src.models.d_fine.d_fine_object_detector import (
+         DFineObjectDetector
+     )
+     from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+     from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
+
+     # Initialize the HGNetV2 backbone.
+     hgnetv2_backbone = HGNetV2Backbone(
+         stem_channels=[3, 16, 16],
+         stackwise_stage_filters=[
+             [16, 16, 64, 1, 3, 3],
+             [64, 32, 256, 1, 3, 3],
+             [256, 64, 512, 2, 3, 5],
+             [512, 128, 1024, 1, 3, 5],
+         ],
+         apply_downsample=[False, True, True, True],
+         use_lightweight_conv_block=[False, False, True, True],
+         depths=[1, 1, 2, 1],
+         hidden_sizes=[64, 256, 512, 1024],
+         embedding_size=16,
+         use_learnable_affine_block=True,
+         hidden_act="relu",
+         image_shape=(256, 256, 3),
+         out_features=["stage3", "stage4"],
+     )
+
+     # Initialize the D-FINE backbone without labels.
+     backbone = DFineBackbone(
+         backbone=hgnetv2_backbone,
+         decoder_in_channels=[128, 128],
+         encoder_hidden_dim=128,
+         num_denoising=100,
+         num_labels=80,
+         hidden_dim=128,
+         learn_initial_query=False,
+         num_queries=300,
+         anchor_image_size=(256, 256),
+         feat_strides=[16, 32],
+         num_feature_levels=2,
+         encoder_in_channels=[512, 1024],
+         encode_proj_layers=[1],
+         num_attention_heads=8,
+         encoder_ffn_dim=512,
+         num_encoder_layers=1,
+         hidden_expansion=0.34,
+         depth_multiplier=0.5,
+         eval_idx=-1,
+         num_decoder_layers=3,
+         decoder_attention_heads=8,
+         decoder_ffn_dim=512,
+         decoder_n_points=[6, 6],
+         lqe_hidden_dim=64,
+         num_lqe_layers=2,
+         out_features=["stage3", "stage4"],
+         image_shape=(256, 256, 3),
+     )
+
+     # Create the detector.
+     detector = DFineObjectDetector(
+         backbone=backbone,
+         num_classes=80,
+         bounding_box_format="yxyx",
+     )
+     ```
+
+     **Creating a DFineObjectDetector with labels for the backbone:**
+
+     ```python
+     import numpy as np
+     from keras_hub.src.models.d_fine.d_fine_object_detector import (
+         DFineObjectDetector
+     )
+     from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+     from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
+
+     # Define labels for the backbone.
+     labels = [
+         {
+             "boxes": np.array([[0.5, 0.5, 0.2, 0.2], [0.4, 0.4, 0.1, 0.1]]),
+             "labels": np.array([1, 10])
+         },
+         {"boxes": np.array([[0.6, 0.6, 0.3, 0.3]]), "labels": np.array([20])},
+     ]
+
+     hgnetv2_backbone = HGNetV2Backbone(
+         stem_channels=[3, 16, 16],
+         stackwise_stage_filters=[
+             [16, 16, 64, 1, 3, 3],
+             [64, 32, 256, 1, 3, 3],
+             [256, 64, 512, 2, 3, 5],
+             [512, 128, 1024, 1, 3, 5],
+         ],
+         apply_downsample=[False, True, True, True],
+         use_lightweight_conv_block=[False, False, True, True],
+         depths=[1, 1, 2, 1],
+         hidden_sizes=[64, 256, 512, 1024],
+         embedding_size=16,
+         use_learnable_affine_block=True,
+         hidden_act="relu",
+         image_shape=(256, 256, 3),
+         out_features=["stage3", "stage4"],
+     )
+
+     # Backbone is initialized with labels.
+     backbone = DFineBackbone(
+         backbone=hgnetv2_backbone,
+         decoder_in_channels=[128, 128],
+         encoder_hidden_dim=128,
+         num_denoising=100,
+         num_labels=80,
+         hidden_dim=128,
+         learn_initial_query=False,
+         num_queries=300,
+         anchor_image_size=(256, 256),
+         feat_strides=[16, 32],
+         num_feature_levels=2,
+         encoder_in_channels=[512, 1024],
+         encode_proj_layers=[1],
+         num_attention_heads=8,
+         encoder_ffn_dim=512,
+         num_encoder_layers=1,
+         hidden_expansion=0.34,
+         depth_multiplier=0.5,
+         eval_idx=-1,
+         num_decoder_layers=3,
+         decoder_attention_heads=8,
+         decoder_ffn_dim=512,
+         decoder_n_points=[6, 6],
+         lqe_hidden_dim=64,
+         num_lqe_layers=2,
+         out_features=["stage3", "stage4"],
+         image_shape=(256, 256, 3),
+         labels=labels,
+         box_noise_scale=1.0,
+         label_noise_ratio=0.5,
+     )
+
+     # Create the detector.
+     detector = DFineObjectDetector(
+         backbone=backbone,
+         num_classes=80,
+         bounding_box_format="yxyx",
+     )
+     ```
+
+     **Using the detector for training:**
+
+     ```python
+     import numpy as np
+     from keras_hub.src.models.d_fine.d_fine_object_detector import (
+         DFineObjectDetector
+     )
+     from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+     from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
+
+     # Initialize backbone and detector.
+     hgnetv2_backbone = HGNetV2Backbone(
+         stem_channels=[3, 16, 16],
+         stackwise_stage_filters=[
+             [16, 16, 64, 1, 3, 3],
+             [64, 32, 256, 1, 3, 3],
+             [256, 64, 512, 2, 3, 5],
+             [512, 128, 1024, 1, 3, 5],
+         ],
+         apply_downsample=[False, True, True, True],
+         use_lightweight_conv_block=[False, False, True, True],
+         depths=[1, 1, 2, 1],
+         hidden_sizes=[64, 256, 512, 1024],
+         embedding_size=16,
+         use_learnable_affine_block=True,
+         hidden_act="relu",
+         image_shape=(256, 256, 3),
+         out_features=["stage3", "stage4"],
+     )
+     backbone = DFineBackbone(
+         backbone=hgnetv2_backbone,
+         decoder_in_channels=[128, 128],
+         encoder_hidden_dim=128,
+         num_denoising=100,
+         num_labels=80,
+         hidden_dim=128,
+         learn_initial_query=False,
+         num_queries=300,
+         anchor_image_size=(256, 256),
+         feat_strides=[16, 32],
+         num_feature_levels=2,
+         encoder_in_channels=[512, 1024],
+         encode_proj_layers=[1],
+         num_attention_heads=8,
+         encoder_ffn_dim=512,
+         num_encoder_layers=1,
+         hidden_expansion=0.34,
+         depth_multiplier=0.5,
+         eval_idx=-1,
+         num_decoder_layers=3,
+         decoder_attention_heads=8,
+         decoder_ffn_dim=512,
+         decoder_n_points=[6, 6],
+         lqe_hidden_dim=64,
+         num_lqe_layers=2,
+         out_features=["stage3", "stage4"],
+         image_shape=(256, 256, 3),
+     )
+     detector = DFineObjectDetector(
+         backbone=backbone,
+         num_classes=80,
+         bounding_box_format="yxyx",
+     )
+
+     # Sample training data.
+     images = np.random.uniform(
+         low=0, high=255, size=(2, 256, 256, 3)
+     ).astype("float32")
+     bounding_boxes = {
+         "boxes": [
+             np.array([[10.0, 20.0, 20.0, 30.0], [20.0, 30.0, 30.0, 40.0]]),
+             np.array([[15.0, 25.0, 25.0, 35.0]]),
+         ],
+         "labels": [
+             np.array([0, 2]), np.array([1])
+         ],
+     }
+
+     # Compile the model.
+     detector.compile(
+         optimizer="adam",
+         loss=detector.compute_loss,
+     )
+
+     # Train the model.
+     detector.fit(x=images, y=bounding_boxes, epochs=1, batch_size=1)
+     ```
+
+     **Making predictions:**
+
+     ```python
+     import numpy as np
+     from keras_hub.src.models.d_fine.d_fine_object_detector import (
+         DFineObjectDetector
+     )
+     from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
+     from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
+
+     # Initialize backbone and detector.
+     hgnetv2_backbone = HGNetV2Backbone(
+         stem_channels=[3, 16, 16],
+         stackwise_stage_filters=[
+             [16, 16, 64, 1, 3, 3],
+             [64, 32, 256, 1, 3, 3],
+             [256, 64, 512, 2, 3, 5],
+             [512, 128, 1024, 1, 3, 5],
+         ],
+         apply_downsample=[False, True, True, True],
+         use_lightweight_conv_block=[False, False, True, True],
+         depths=[1, 1, 2, 1],
+         hidden_sizes=[64, 256, 512, 1024],
+         embedding_size=16,
+         use_learnable_affine_block=True,
+         hidden_act="relu",
+         image_shape=(256, 256, 3),
+         out_features=["stage3", "stage4"],
+     )
+     backbone = DFineBackbone(
+         backbone=hgnetv2_backbone,
+         decoder_in_channels=[128, 128],
+         encoder_hidden_dim=128,
+         num_denoising=100,
+         num_labels=80,
+         hidden_dim=128,
+         learn_initial_query=False,
+         num_queries=300,
+         anchor_image_size=(256, 256),
+         feat_strides=[16, 32],
+         num_feature_levels=2,
+         encoder_in_channels=[512, 1024],
+         encode_proj_layers=[1],
+         num_attention_heads=8,
+         encoder_ffn_dim=512,
+         num_encoder_layers=1,
+         hidden_expansion=0.34,
+         depth_multiplier=0.5,
+         eval_idx=-1,
+         num_decoder_layers=3,
+         decoder_attention_heads=8,
+         decoder_ffn_dim=512,
+         decoder_n_points=[6, 6],
+         lqe_hidden_dim=64,
+         num_lqe_layers=2,
+         out_features=["stage3", "stage4"],
+         image_shape=(256, 256, 3),
+     )
+     detector = DFineObjectDetector(
+         backbone=backbone,
+         num_classes=80,
+         bounding_box_format="yxyx",
+     )
+
+     # Sample test image.
+     test_image = np.random.uniform(
+         low=0, high=255, size=(1, 256, 256, 3)
+     ).astype("float32")
+
+     # Make predictions.
+     predictions = detector.predict(test_image)
+
+     # Access predictions.
+     boxes = predictions["boxes"]  # Shape: (1, 100, 4)
+     labels = predictions["labels"]  # Shape: (1, 100)
+     confidence = predictions["confidence"]  # Shape: (1, 100)
+     num_detections = predictions["num_detections"]  # Shape: (1,)
+     ```
+     """
+
+     backbone_cls = DFineBackbone
+     preprocessor_cls = DFineObjectDetectorPreprocessor
+
+     def __init__(
+         self,
+         backbone,
+         num_classes,
+         bounding_box_format="yxyx",
+         preprocessor=None,
+         matcher_class_cost=2.0,
+         matcher_bbox_cost=5.0,
+         matcher_ciou_cost=2.0,
+         use_focal_loss=True,
+         matcher_alpha=0.25,
+         matcher_gamma=2.0,
+         weight_loss_vfl=1.0,
+         weight_loss_bbox=5.0,
+         weight_loss_ciou=2.0,
+         weight_loss_fgl=0.15,
+         weight_loss_ddf=1.5,
+         ddf_temperature=5.0,
+         prediction_decoder=None,
+         activation=None,
+         **kwargs,
+     ):
+         assert_bounding_box_support(self.__class__.__name__)
+
+         # === Functional Model ===
+         image_input = keras.layers.Input(
+             shape=backbone.image_shape, name="images"
+         )
+         outputs = backbone(image_input)
+         intermediate_logits = outputs["intermediate_logits"]
+         intermediate_reference_points = outputs["intermediate_reference_points"]
+         intermediate_predicted_corners = outputs[
+             "intermediate_predicted_corners"
+         ]
+         initial_reference_points = outputs["initial_reference_points"]
+         logits = intermediate_logits[:, -1, :, :]
+         pred_boxes = intermediate_reference_points[:, -1, :, :]
+         model_outputs = {
+             "logits": logits,
+             "pred_boxes": pred_boxes,
+             "intermediate_logits": intermediate_logits,
+             "intermediate_reference_points": intermediate_reference_points,
+             "intermediate_predicted_corners": intermediate_predicted_corners,
+             "initial_reference_points": initial_reference_points,
+             "enc_topk_logits": outputs["enc_topk_logits"],
+             "enc_topk_bboxes": outputs["enc_topk_bboxes"],
+         }
+         if "dn_num_group" in outputs:
+             model_outputs["dn_positive_idx"] = outputs["dn_positive_idx"]
+             model_outputs["dn_num_group"] = outputs["dn_num_group"]
+             model_outputs["dn_num_split"] = outputs["dn_num_split"]
+         super().__init__(
+             inputs=image_input,
+             outputs=model_outputs,
+             **kwargs,
+         )
+
+         # === Config ===
+         self.backbone = backbone
+         self.num_classes = num_classes
+         self.bounding_box_format = bounding_box_format
+         self.preprocessor = preprocessor
+         self.matcher_class_cost = matcher_class_cost
+         self.matcher_bbox_cost = matcher_bbox_cost
+         self.matcher_ciou_cost = matcher_ciou_cost
+         self.use_focal_loss = use_focal_loss
+         self.matcher_alpha = matcher_alpha
+         self.matcher_gamma = matcher_gamma
+         self.weight_dict = {
+             "loss_vfl": weight_loss_vfl,
+             "loss_bbox": weight_loss_bbox,
+             "loss_ciou": weight_loss_ciou,
+             "loss_fgl": weight_loss_fgl,
+             "loss_ddf": weight_loss_ddf,
+         }
+         self.ddf_temperature = ddf_temperature
+         self.activation = activation
+         self._prediction_decoder = prediction_decoder or NonMaxSuppression(
+             from_logits=(self.activation != keras.activations.sigmoid),
+             bounding_box_format=self.bounding_box_format,
+             max_detections=backbone.num_queries,
+         )
+
+     def compute_loss(self, x, y, y_pred, sample_weight, **kwargs):
+         gt_boxes = y["boxes"]
+         gt_labels = y["labels"]
+         batch_size = keras.ops.shape(gt_labels)[0]
+         num_objects = keras.ops.shape(gt_labels)[1]
+         num_targets_per_image = keras.ops.tile(
+             keras.ops.expand_dims(num_objects, 0), [batch_size]
+         )
+         labels_for_item = keras.ops.reshape(gt_labels, [-1])
+         boxes_for_item = keras.ops.reshape(gt_boxes, [-1, 4])
+         targets = {"labels": labels_for_item, "boxes": boxes_for_item}
+
+         intermediate_logits_all = y_pred["intermediate_logits"]
+         intermediate_ref_points_all = y_pred["intermediate_reference_points"]
+         predicted_corners_all = y_pred["intermediate_predicted_corners"]
+         initial_ref_points_all = y_pred["initial_reference_points"]
+         enc_topk_logits = y_pred["enc_topk_logits"]
+         enc_topk_bboxes = y_pred["enc_topk_bboxes"]
+         if "dn_num_group" in y_pred:
+             denoising_meta_values = {
+                 "dn_positive_idx": y_pred["dn_positive_idx"],
+                 "dn_num_group": y_pred["dn_num_group"],
+                 "dn_num_split": y_pred["dn_num_split"],
+             }
+             dn_split_point = self.backbone.dn_split_point
+             (
+                 dn_intermediate_logits,
+                 matching_intermediate_logits,
+             ) = keras.ops.split(
+                 intermediate_logits_all, [dn_split_point], axis=2
+             )
+             (
+                 dn_intermediate_ref_points,
+                 matching_intermediate_ref_points,
+             ) = keras.ops.split(
+                 intermediate_ref_points_all, [dn_split_point], axis=2
+             )
+             (
+                 dn_predicted_corners,
+                 matching_predicted_corners,
+             ) = keras.ops.split(predicted_corners_all, [dn_split_point], axis=2)
+             (
+                 dn_initial_ref_points,
+                 matching_initial_ref_points,
+             ) = keras.ops.split(
+                 initial_ref_points_all, [dn_split_point], axis=2
+             )
+         else:
+             denoising_meta_values = None
+             matching_intermediate_logits = intermediate_logits_all
+             matching_intermediate_ref_points = intermediate_ref_points_all
+             matching_predicted_corners = predicted_corners_all
+             matching_initial_ref_points = initial_ref_points_all
+         matching_logits = matching_intermediate_logits[:, -1, :, :]
+         matching_pred_boxes = matching_intermediate_ref_points[:, -1, :, :]
+         outputs_without_aux = {
+             "logits": matching_logits,
+             "pred_boxes": keras.ops.clip(matching_pred_boxes, 0, 1),
+         }
+         indices = hungarian_matcher(
+             outputs_without_aux,
+             [targets],
+             num_targets_per_image,
+             self.use_focal_loss,
+             self.matcher_alpha,
+             self.matcher_gamma,
+             self.matcher_bbox_cost,
+             self.matcher_class_cost,
+             self.matcher_ciou_cost,
+             self.backbone,
+         )
+         num_boxes = keras.ops.shape(labels_for_item)[0]
+         num_boxes = keras.ops.convert_to_tensor(num_boxes, dtype="float32")
+         num_boxes = keras.ops.maximum(num_boxes, 1.0)
+         losses = {}
+         vfl_loss = compute_vfl_loss(
+             outputs_without_aux,
+             [targets],
+             indices,
+             num_boxes,
+             self.num_classes,
+             self.matcher_alpha,
+             self.matcher_gamma,
+         )
+         losses.update(
+             {
+                 k: v * self.weight_dict[k]
+                 for k, v in vfl_loss.items()
+                 if k in self.weight_dict
+             }
+         )
+         box_losses = compute_box_losses(
+             outputs_without_aux, [targets], indices, num_boxes
+         )
+         losses.update(
+             {
+                 k: v * self.weight_dict[k]
+                 for k, v in box_losses.items()
+                 if k in self.weight_dict
+             }
+         )
+         local_losses = compute_local_losses(
+             {
+                 **outputs_without_aux,
+                 "pred_corners": matching_predicted_corners[:, -1, :, :],
+                 "ref_points": matching_initial_ref_points[:, -1, :, :],
+                 "teacher_corners": keras.ops.zeros_like(
+                     matching_predicted_corners[:, -1, :, :]
+                 ),
+                 "teacher_logits": keras.ops.zeros_like(matching_logits),
+             },
+             [targets],
+             indices,
+             num_boxes,
+             self.backbone,
+             self.ddf_temperature,
+             compute_ddf=False,
+         )
+         losses.update(
+             {
+                 k: v * self.weight_dict[k]
+                 for k, v in local_losses.items()
+                 if k in self.weight_dict
+             }
+         )
+
+         num_aux_layers = self.backbone.num_decoder_layers
+         auxiliary_outputs_list = [
+             {
+                 "logits": matching_intermediate_logits[:, i, :, :],
+                 "pred_boxes": keras.ops.clip(
+                     matching_intermediate_ref_points[:, i, :, :], 0, 1
+                 ),
+                 "pred_corners": matching_predicted_corners[:, i, :, :],
+                 "ref_points": matching_initial_ref_points[:, i, :, :],
+                 "teacher_corners": matching_predicted_corners[:, -1, :, :],
+                 "teacher_logits": matching_intermediate_logits[:, -1, :, :],
+             }
+             for i in range(num_aux_layers)
+         ]
+         for i, aux_output in enumerate(auxiliary_outputs_list):
+             aux_indices = hungarian_matcher(
+                 aux_output,
+                 [targets],
+                 num_targets_per_image,
+                 self.use_focal_loss,
+                 self.matcher_alpha,
+                 self.matcher_gamma,
+                 self.matcher_bbox_cost,
+                 self.matcher_class_cost,
+                 self.matcher_ciou_cost,
+                 self.backbone,
+             )
+             aux_vfl_loss = compute_vfl_loss(
+                 aux_output,
+                 [targets],
+                 aux_indices,
+                 num_boxes,
+                 self.num_classes,
+                 self.matcher_alpha,
+                 self.matcher_gamma,
+             )
+             aux_box_losses = compute_box_losses(
+                 aux_output, [targets], aux_indices, num_boxes
+             )
+             is_not_last_aux_layer = i < len(auxiliary_outputs_list) - 1
+             aux_local_losses = compute_local_losses(
+                 aux_output,
+                 [targets],
+                 aux_indices,
+                 num_boxes,
+                 self.backbone,
+                 self.ddf_temperature,
+                 compute_ddf=is_not_last_aux_layer,
+             )
+             aux_losses = {**aux_vfl_loss, **aux_box_losses, **aux_local_losses}
+             weighted_aux_losses = {
+                 k + f"_aux_{i}": v * self.weight_dict[k]
+                 for k, v in aux_losses.items()
+                 if k in self.weight_dict
+             }
+             losses.update(weighted_aux_losses)
+         # Add encoder loss.
+         enc_output = {
+             "logits": enc_topk_logits,
+             "pred_boxes": keras.ops.clip(enc_topk_bboxes, 0, 1),
+         }
+         enc_indices = hungarian_matcher(
+             enc_output,
+             [targets],
+             num_targets_per_image,
+             self.use_focal_loss,
+             self.matcher_alpha,
+             self.matcher_gamma,
+             self.matcher_bbox_cost,
+             self.matcher_class_cost,
+             self.matcher_ciou_cost,
+             self.backbone,
+         )
+         enc_vfl_loss = compute_vfl_loss(
+             enc_output,
+             [targets],
+             enc_indices,
+             num_boxes,
+             self.num_classes,
+             self.matcher_alpha,
+             self.matcher_gamma,
+         )
+         enc_box_losses = compute_box_losses(
+             enc_output, [targets], enc_indices, num_boxes
+         )
+         enc_losses = {**enc_vfl_loss, **enc_box_losses}
+         weighted_enc_losses = {
+             k + "_enc": v * self.weight_dict[k]
+             for k, v in enc_losses.items()
+             if k in self.weight_dict
+         }
+         losses.update(weighted_enc_losses)
+
+         if denoising_meta_values is not None:
+             dn_indices = get_cdn_matched_indices(denoising_meta_values)
+             dn_num_group = denoising_meta_values["dn_num_group"][0]
+             num_boxes_dn = num_boxes * keras.ops.cast(dn_num_group, "float32")
+             num_dn_layers = self.backbone.num_decoder_layers + 1
+             for i in range(num_dn_layers):
+                 is_not_last_layer = keras.ops.less(i, num_dn_layers - 1)
+                 teacher_idx = num_dn_layers - 1
+                 dn_aux_output = {
+                     "logits": dn_intermediate_logits[:, i, :, :],
+                     "pred_boxes": keras.ops.clip(
+                         dn_intermediate_ref_points[:, i, :, :], 0, 1
+                     ),
+                     "pred_corners": dn_predicted_corners[:, i, :, :],
+                     "ref_points": dn_initial_ref_points[:, i, :, :],
+                     "teacher_corners": dn_predicted_corners[
+                         :, teacher_idx, :, :
+                     ],
+                     "teacher_logits": dn_intermediate_logits[
+                         :, teacher_idx, :, :
+                     ],
+                 }
+                 vfl_loss = compute_vfl_loss(
+                     dn_aux_output,
+                     [targets],
+                     dn_indices,
+                     num_boxes_dn,
+                     self.num_classes,
+                     self.matcher_alpha,
+                     self.matcher_gamma,
+                 )
+                 box_losses = compute_box_losses(
+                     dn_aux_output, [targets], dn_indices, num_boxes_dn
+                 )
+                 local_losses = compute_local_losses(
+                     dn_aux_output,
+                     [targets],
+                     dn_indices,
+                     num_boxes_dn,
+                     self.backbone,
+                     self.ddf_temperature,
+                     compute_ddf=is_not_last_layer,
+                 )
+                 all_losses = {**vfl_loss, **box_losses, **local_losses}
+                 weighted_losses = {
+                     k + f"_dn_{i}": v * self.weight_dict[k]
+                     for k, v in all_losses.items()
+                     if k in self.weight_dict
+                 }
+                 losses.update(weighted_losses)
+         total_loss = keras.ops.sum([v for v in losses.values()])
+         return total_loss
+
+     @property
+     def prediction_decoder(self):
+         return self._prediction_decoder
+
+     @prediction_decoder.setter
+     def prediction_decoder(self, prediction_decoder):
+         if prediction_decoder.bounding_box_format != self.bounding_box_format:
+             raise ValueError(
+                 "Expected `prediction_decoder` and `DFineObjectDetector` to "
+                 "use the same `bounding_box_format`, but got "
+                 "`prediction_decoder.bounding_box_format="
+                 f"{prediction_decoder.bounding_box_format}`, and "
+                 "`self.bounding_box_format="
+                 f"{self.bounding_box_format}`."
+             )
+         self._prediction_decoder = prediction_decoder
+         self.make_predict_function(force=True)
+         self.make_train_function(force=True)
+         self.make_test_function(force=True)
+
+     def decode_predictions(self, predictions, data):
+         """Decodes raw model predictions into final bounding boxes.
+
+         This method takes the raw output from the model (logits and normalized
+         bounding boxes in center format) and converts them into the final
+         detection format. The process involves:
+         1. Denormalizing the bounding box coordinates to the original image
+            dimensions.
+         2. Converting boxes from center format `(cx, cy, w, h)` to corner
+            format `(ymin, xmin, ymax, xmax)`.
+         3. Applying non-maximum suppression to filter out overlapping boxes
+            and keep only the most confident detections.
+
+         Args:
+             predictions: dict. A dictionary of tensors from the model,
+                 containing `"logits"` and `"pred_boxes"`.
+             data: tuple. The input data tuple, from which the original images
+                 are extracted to obtain their dimensions for denormalization.
+
+         Returns:
+             A dictionary of final predictions, containing `"boxes"`,
+             `"labels"`, `"confidence"`, and `"num_detections"`.
+         """
+         if isinstance(data, (list, tuple)):
+             images, _ = data
+         else:
+             images = data
+         logits = predictions["logits"]
+         pred_boxes = predictions["pred_boxes"]
+         height, width, _ = keras.ops.shape(images)[1:]
+         denormalized_boxes = keras.ops.stack(
+             [
+                 pred_boxes[..., 0] * width,  # center_x
+                 pred_boxes[..., 1] * height,  # center_y
+                 pred_boxes[..., 2] * width,  # width
+                 pred_boxes[..., 3] * height,  # height
+             ],
+             axis=-1,
+         )
+         pred_boxes_xyxy = keras.utils.bounding_boxes.convert_format(
+             denormalized_boxes,
+             source="center_xywh",
+             target="xyxy",
+         )
+         pred_boxes_yxyx = keras.ops.stack(
+             [
+                 pred_boxes_xyxy[..., 1],  # y_min
+                 pred_boxes_xyxy[..., 0],  # x_min
+                 pred_boxes_xyxy[..., 3],  # y_max
+                 pred_boxes_xyxy[..., 2],  # x_max
+             ],
+             axis=-1,
+         )
+         y_pred = self.prediction_decoder(pred_boxes_yxyx, logits, images=images)
+         return y_pred
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "num_classes": self.num_classes,
+                 "bounding_box_format": self.bounding_box_format,
+                 "matcher_class_cost": self.matcher_class_cost,
+                 "matcher_bbox_cost": self.matcher_bbox_cost,
+                 "matcher_ciou_cost": self.matcher_ciou_cost,
+                 "use_focal_loss": self.use_focal_loss,
+                 "matcher_alpha": self.matcher_alpha,
+                 "matcher_gamma": self.matcher_gamma,
+                 "weight_loss_vfl": self.weight_dict["loss_vfl"],
+                 "weight_loss_bbox": self.weight_dict["loss_bbox"],
+                 "weight_loss_ciou": self.weight_dict["loss_ciou"],
+                 "weight_loss_fgl": self.weight_dict["loss_fgl"],
+                 "weight_loss_ddf": self.weight_dict["loss_ddf"],
+                 "ddf_temperature": self.ddf_temperature,
+                 "prediction_decoder": keras.saving.serialize_keras_object(
+                     self._prediction_decoder
+                 ),
+             }
+         )
+         return config
+
+     def predict_step(self, *args):
+         outputs = super().predict_step(*args)
+         if isinstance(outputs, tuple):
+             return self.decode_predictions(outputs[0], args[-1]), outputs[1]
+         return self.decode_predictions(outputs, *args)
+
+     @classmethod
+     def from_config(cls, config):
+         config = config.copy()
+         if "backbone" in config and isinstance(config["backbone"], dict):
+             config["backbone"] = keras.saving.deserialize_keras_object(
+                 config["backbone"]
+             )
+         if "preprocessor" in config and isinstance(
+             config["preprocessor"], dict
+         ):
+             config["preprocessor"] = keras.saving.deserialize_keras_object(
+                 config["preprocessor"]
+             )
+         if "prediction_decoder" in config and isinstance(
+             config["prediction_decoder"], dict
+         ):
+             config["prediction_decoder"] = (
+                 keras.saving.deserialize_keras_object(
+                     config["prediction_decoder"]
+                 )
+             )
+         return cls(**config)
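
For readers skimming `compute_loss` above: every component loss dict is filtered through `weight_dict` and rescaled before being accumulated, with a suffix marking where each term came from (`_aux_{i}` for intermediate decoder layers, `_enc` for the encoder head, `_dn_{i}` for denoising groups), and the final scalar is the sum over all weighted terms. A standalone sketch of that aggregation pattern, using the constructor's default weights and made-up loss values for illustration:

```python
import keras

# Weights as configured by DFineObjectDetector's constructor defaults.
weight_dict = {
    "loss_vfl": 1.0,
    "loss_bbox": 5.0,
    "loss_ciou": 2.0,
    "loss_fgl": 0.15,
    "loss_ddf": 1.5,
}

# Hypothetical per-component losses from one auxiliary decoder layer (i=0).
aux_losses = {"loss_vfl": 0.8, "loss_bbox": 0.3, "loss_ciou": 0.5}

losses = {}
# The same pattern `compute_loss` applies to every head: keep only keys that
# have a weight, rescale, and tag the key with the head's suffix.
losses.update(
    {
        k + "_aux_0": v * weight_dict[k]
        for k, v in aux_losses.items()
        if k in weight_dict
    }
)
# The final scalar is just the sum over all weighted terms.
total_loss = keras.ops.sum([v for v in losses.values()])
print(losses)  # {'loss_vfl_aux_0': 0.8, 'loss_bbox_aux_0': 1.5, ...}
```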
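Likewise, the coordinate handling in `decode_predictions` can be isolated as plain arithmetic: normalized `(cx, cy, w, h)` boxes are scaled by the image width and height, converted to corner form, and reordered to the `"yxyx"` layout that the default `NonMaxSuppression` decoder is constructed with. A self-contained sketch with one hypothetical box on a 256x256 image:

```python
import numpy as np

height, width = 256, 256
# One normalized box in (center_x, center_y, width, height) format, as
# produced by the backbone; the values here are hypothetical.
cx, cy, w, h = 0.5, 0.5, 0.2, 0.2

# Step 1: denormalize to pixel coordinates.
cx, cy, w, h = cx * width, cy * height, w * width, h * height

# Step 2: center form -> corner form (x_min, y_min, x_max, y_max).
x_min, y_min = cx - w / 2, cy - h / 2
x_max, y_max = cx + w / 2, cy + h / 2

# Step 3: reorder to "yxyx", the format handed to the prediction decoder.
box_yxyx = np.array([y_min, x_min, y_max, x_max])
print(box_yxyx)  # [102.4 102.4 153.6 153.6]
```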