keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +6 -0
- keras_hub/models/__init__.py +21 -0
- keras_hub/src/layers/modeling/position_embedding.py +21 -6
- keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
- keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
- keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
- keras_hub/src/models/backbone.py +10 -15
- keras_hub/src/models/d_fine/__init__.py +0 -0
- keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
- keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
- keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
- keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
- keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
- keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
- keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
- keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
- keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
- keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
- keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
- keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
- keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
- keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
- keras_hub/src/models/parseq/__init__.py +0 -0
- keras_hub/src/models/parseq/parseq_backbone.py +134 -0
- keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
- keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
- keras_hub/src/models/parseq/parseq_decoder.py +418 -0
- keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
- keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
- keras_hub/src/tests/test_case.py +37 -1
- keras_hub/src/utils/preset_utils.py +49 -0
- keras_hub/src/utils/tensor_utils.py +23 -1
- keras_hub/src/utils/transformers/convert_vit.py +4 -1
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
keras_hub/src/models/d_fine/d_fine_object_detector.py (new file)

@@ -0,0 +1,875 @@
import keras

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.modeling.non_max_supression import NonMaxSuppression
from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
from keras_hub.src.models.d_fine.d_fine_loss import compute_box_losses
from keras_hub.src.models.d_fine.d_fine_loss import compute_local_losses
from keras_hub.src.models.d_fine.d_fine_loss import compute_vfl_loss
from keras_hub.src.models.d_fine.d_fine_loss import get_cdn_matched_indices
from keras_hub.src.models.d_fine.d_fine_loss import hungarian_matcher
from keras_hub.src.models.d_fine.d_fine_object_detector_preprocessor import (
    DFineObjectDetectorPreprocessor,
)
from keras_hub.src.models.object_detector import ObjectDetector
from keras_hub.src.utils.tensor_utils import assert_bounding_box_support


@keras_hub_export("keras_hub.models.DFineObjectDetector")
class DFineObjectDetector(ObjectDetector):
"""D-FINE Object Detector model.
|
21
|
+
|
22
|
+
This class wraps the `DFineBackbone` and adds the final prediction and loss
|
23
|
+
computation logic for end-to-end object detection. It is responsible for:
|
24
|
+
1. Defining the functional model that connects the `DFineBackbone` to the
|
25
|
+
input layers.
|
26
|
+
2. Implementing the `compute_loss` method, which uses a Hungarian matcher
|
27
|
+
to assign predictions to ground truth targets and calculates a weighted
|
28
|
+
sum of multiple loss components (classification, bounding box, etc.).
|
29
|
+
3. Post-processing the raw outputs from the backbone into final, decoded
|
30
|
+
predictions (boxes, labels, confidence scores) during inference.
|
31
|
+
|
32
|
+
Args:
|
33
|
+
backbone: A `keras_hub.models.Backbone` instance, specifically a
|
34
|
+
`DFineBackbone`, serving as the feature extractor for the object
|
35
|
+
detector.
|
36
|
+
num_classes: An integer representing the number of object classes to
|
37
|
+
detect.
|
38
|
+
bounding_box_format: A string specifying the format of the bounding
|
39
|
+
boxes. Defaults to `"yxyx"`. Must be a supported format (e.g.,
|
40
|
+
`"yxyx"`, `"xyxy"`).
|
41
|
+
preprocessor: Optional. An instance of `DFineObjectDetectorPreprocessor`
|
42
|
+
for input data preprocessing.
|
43
|
+
matcher_class_cost: A float representing the cost for class mismatch in
|
44
|
+
the Hungarian matcher. Defaults to `2.0`.
|
45
|
+
matcher_bbox_cost: A float representing the cost for bounding box
|
46
|
+
mismatch in the Hungarian matcher. Defaults to `5.0`.
|
47
|
+
matcher_ciou_cost: A float representing the cost for complete IoU
|
48
|
+
mismatch in the Hungarian matcher. Defaults to `2.0`.
|
49
|
+
use_focal_loss: A boolean indicating whether to use focal loss for
|
50
|
+
classification. Defaults to `True`.
|
51
|
+
matcher_alpha: A float parameter for the focal loss alpha. Defaults to
|
52
|
+
`0.25`.
|
53
|
+
matcher_gamma: A float parameter for the focal loss gamma. Defaults to
|
54
|
+
`2.0`.
|
55
|
+
weight_loss_vfl: Weight for the classification loss. Defaults to `1.0`.
|
56
|
+
weight_loss_bbox: Weight for the bounding box regression loss. Default
|
57
|
+
is `5.0`.
|
58
|
+
weight_loss_ciou: Weight for the complete IoU loss. Defaults to
|
59
|
+
`2.0`.
|
60
|
+
weight_loss_fgl: Weight for the focal grid loss. Defaults to `0.15`.
|
61
|
+
weight_loss_ddf: Weight for the DDF loss. Defaults to `1.5`.
|
62
|
+
ddf_temperature: A float temperature scaling factor for the DDF loss.
|
63
|
+
Defaults to `5.0`.
|
64
|
+
prediction_decoder: Optional. A `keras.layers.Layer` instance that
|
65
|
+
decodes raw predictions. If not provided, a `NonMaxSuppression`
|
66
|
+
layer is used.
|
67
|
+
activation: Optional. The activation function to apply to the logits
|
68
|
+
before decoding. Defaults to `None`.
|
69
|
+
|
70
|
+
Examples:
|
71
|
+
|
72
|
+
**Creating a DFineObjectDetector without labels:**
|
73
|
+
|
74
|
+
```python
|
75
|
+
import numpy as np
|
76
|
+
from keras_hub.src.models.d_fine.d_fine_object_detector import (
|
77
|
+
DFineObjectDetector
|
78
|
+
)
|
79
|
+
from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
|
80
|
+
from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
|
81
|
+
|
82
|
+
# Initialize the backbone without labels.
|
83
|
+
hgnetv2_backbone = HGNetV2Backbone(
|
84
|
+
stem_channels=[3, 16, 16],
|
85
|
+
stackwise_stage_filters=[
|
86
|
+
[16, 16, 64, 1, 3, 3],
|
87
|
+
[64, 32, 256, 1, 3, 3],
|
88
|
+
[256, 64, 512, 2, 3, 5],
|
89
|
+
[512, 128, 1024, 1, 3, 5],
|
90
|
+
],
|
91
|
+
apply_downsample=[False, True, True, True],
|
92
|
+
use_lightweight_conv_block=[False, False, True, True],
|
93
|
+
depths=[1, 1, 2, 1],
|
94
|
+
hidden_sizes=[64, 256, 512, 1024],
|
95
|
+
embedding_size=16,
|
96
|
+
use_learnable_affine_block=True,
|
97
|
+
hidden_act="relu",
|
98
|
+
image_shape=(256, 256, 3),
|
99
|
+
out_features=["stage3", "stage4"],
|
100
|
+
)
|
101
|
+
|
102
|
+
# Initialize the backbone without labels.
|
103
|
+
backbone = DFineBackbone(
|
104
|
+
backbone=hgnetv2_backbone,
|
105
|
+
decoder_in_channels=[128, 128],
|
106
|
+
encoder_hidden_dim=128,
|
107
|
+
num_denoising=100,
|
108
|
+
num_labels=80,
|
109
|
+
hidden_dim=128,
|
110
|
+
learn_initial_query=False,
|
111
|
+
num_queries=300,
|
112
|
+
anchor_image_size=(256, 256),
|
113
|
+
feat_strides=[16, 32],
|
114
|
+
num_feature_levels=2,
|
115
|
+
encoder_in_channels=[512, 1024],
|
116
|
+
encode_proj_layers=[1],
|
117
|
+
num_attention_heads=8,
|
118
|
+
encoder_ffn_dim=512,
|
119
|
+
num_encoder_layers=1,
|
120
|
+
hidden_expansion=0.34,
|
121
|
+
depth_multiplier=0.5,
|
122
|
+
eval_idx=-1,
|
123
|
+
num_decoder_layers=3,
|
124
|
+
decoder_attention_heads=8,
|
125
|
+
decoder_ffn_dim=512,
|
126
|
+
decoder_n_points=[6, 6],
|
127
|
+
lqe_hidden_dim=64,
|
128
|
+
num_lqe_layers=2,
|
129
|
+
out_features=["stage3", "stage4"],
|
130
|
+
image_shape=(256, 256, 3),
|
131
|
+
)
|
132
|
+
|
133
|
+
# Create the detector.
|
134
|
+
detector = DFineObjectDetector(
|
135
|
+
backbone=backbone,
|
136
|
+
num_classes=80,
|
137
|
+
bounding_box_format="yxyx",
|
138
|
+
)
|
139
|
+
```
|
140
|
+
|
141
|
+
**Creating a DFineObjectDetector with labels for the backbone:**
|
142
|
+
|
143
|
+
```python
|
144
|
+
import numpy as np
|
145
|
+
from keras_hub.src.models.d_fine.d_fine_object_detector import (
|
146
|
+
DFineObjectDetector
|
147
|
+
)
|
148
|
+
from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
|
149
|
+
from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
|
150
|
+
|
151
|
+
# Define labels for the backbone.
|
152
|
+
labels = [
|
153
|
+
{
|
154
|
+
"boxes": np.array([[0.5, 0.5, 0.2, 0.2], [0.4, 0.4, 0.1, 0.1]]),
|
155
|
+
"labels": np.array([1, 10])
|
156
|
+
},
|
157
|
+
{"boxes": np.array([[0.6, 0.6, 0.3, 0.3]]), "labels": np.array([20])},
|
158
|
+
]
|
159
|
+
|
160
|
+
hgnetv2_backbone = HGNetV2Backbone(
|
161
|
+
stem_channels=[3, 16, 16],
|
162
|
+
stackwise_stage_filters=[
|
163
|
+
[16, 16, 64, 1, 3, 3],
|
164
|
+
[64, 32, 256, 1, 3, 3],
|
165
|
+
[256, 64, 512, 2, 3, 5],
|
166
|
+
[512, 128, 1024, 1, 3, 5],
|
167
|
+
],
|
168
|
+
apply_downsample=[False, True, True, True],
|
169
|
+
use_lightweight_conv_block=[False, False, True, True],
|
170
|
+
depths=[1, 1, 2, 1],
|
171
|
+
hidden_sizes=[64, 256, 512, 1024],
|
172
|
+
embedding_size=16,
|
173
|
+
use_learnable_affine_block=True,
|
174
|
+
hidden_act="relu",
|
175
|
+
image_shape=(256, 256, 3),
|
176
|
+
out_features=["stage3", "stage4"],
|
177
|
+
)
|
178
|
+
|
179
|
+
# Backbone is initialized with labels.
|
180
|
+
backbone = DFineBackbone(
|
181
|
+
backbone=hgnetv2_backbone,
|
182
|
+
decoder_in_channels=[128, 128],
|
183
|
+
encoder_hidden_dim=128,
|
184
|
+
num_denoising=100,
|
185
|
+
num_labels=80,
|
186
|
+
hidden_dim=128,
|
187
|
+
learn_initial_query=False,
|
188
|
+
num_queries=300,
|
189
|
+
anchor_image_size=(256, 256),
|
190
|
+
feat_strides=[16, 32],
|
191
|
+
num_feature_levels=2,
|
192
|
+
encoder_in_channels=[512, 1024],
|
193
|
+
encode_proj_layers=[1],
|
194
|
+
num_attention_heads=8,
|
195
|
+
encoder_ffn_dim=512,
|
196
|
+
num_encoder_layers=1,
|
197
|
+
hidden_expansion=0.34,
|
198
|
+
depth_multiplier=0.5,
|
199
|
+
eval_idx=-1,
|
200
|
+
num_decoder_layers=3,
|
201
|
+
decoder_attention_heads=8,
|
202
|
+
decoder_ffn_dim=512,
|
203
|
+
decoder_n_points=[6, 6],
|
204
|
+
lqe_hidden_dim=64,
|
205
|
+
num_lqe_layers=2,
|
206
|
+
out_features=["stage3", "stage4"],
|
207
|
+
image_shape=(256, 256, 3),
|
208
|
+
labels=labels,
|
209
|
+
box_noise_scale=1.0,
|
210
|
+
label_noise_ratio=0.5,
|
211
|
+
)
|
212
|
+
|
213
|
+
# Create the detector.
|
214
|
+
detector = DFineObjectDetector(
|
215
|
+
backbone=backbone,
|
216
|
+
num_classes=80,
|
217
|
+
bounding_box_format="yxyx",
|
218
|
+
)
|
219
|
+
```
|
220
|
+
|
221
|
+
**Using the detector for training:**
|
222
|
+
|
223
|
+
```python
|
224
|
+
import numpy as np
|
225
|
+
from keras_hub.src.models.d_fine.d_fine_object_detector import (
|
226
|
+
DFineObjectDetector
|
227
|
+
)
|
228
|
+
from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
|
229
|
+
from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
|
230
|
+
|
231
|
+
# Initialize backbone and detector.
|
232
|
+
hgnetv2_backbone = HGNetV2Backbone(
|
233
|
+
stem_channels=[3, 16, 16],
|
234
|
+
stackwise_stage_filters=[
|
235
|
+
[16, 16, 64, 1, 3, 3],
|
236
|
+
[64, 32, 256, 1, 3, 3],
|
237
|
+
[256, 64, 512, 2, 3, 5],
|
238
|
+
[512, 128, 1024, 1, 3, 5],
|
239
|
+
],
|
240
|
+
apply_downsample=[False, True, True, True],
|
241
|
+
use_lightweight_conv_block=[False, False, True, True],
|
242
|
+
depths=[1, 1, 2, 1],
|
243
|
+
hidden_sizes=[64, 256, 512, 1024],
|
244
|
+
embedding_size=16,
|
245
|
+
use_learnable_affine_block=True,
|
246
|
+
hidden_act="relu",
|
247
|
+
image_shape=(256, 256, 3),
|
248
|
+
out_features=["stage3", "stage4"],
|
249
|
+
)
|
250
|
+
backbone = DFineBackbone(
|
251
|
+
backbone=hgnetv2_backbone,
|
252
|
+
decoder_in_channels=[128, 128],
|
253
|
+
encoder_hidden_dim=128,
|
254
|
+
num_denoising=100,
|
255
|
+
num_labels=80,
|
256
|
+
hidden_dim=128,
|
257
|
+
learn_initial_query=False,
|
258
|
+
num_queries=300,
|
259
|
+
anchor_image_size=(256, 256),
|
260
|
+
feat_strides=[16, 32],
|
261
|
+
num_feature_levels=2,
|
262
|
+
encoder_in_channels=[512, 1024],
|
263
|
+
encode_proj_layers=[1],
|
264
|
+
num_attention_heads=8,
|
265
|
+
encoder_ffn_dim=512,
|
266
|
+
num_encoder_layers=1,
|
267
|
+
hidden_expansion=0.34,
|
268
|
+
depth_multiplier=0.5,
|
269
|
+
eval_idx=-1,
|
270
|
+
num_decoder_layers=3,
|
271
|
+
decoder_attention_heads=8,
|
272
|
+
decoder_ffn_dim=512,
|
273
|
+
decoder_n_points=[6, 6],
|
274
|
+
lqe_hidden_dim=64,
|
275
|
+
num_lqe_layers=2,
|
276
|
+
out_features=["stage3", "stage4"],
|
277
|
+
image_shape=(256, 256, 3),
|
278
|
+
)
|
279
|
+
detector = DFineObjectDetector(
|
280
|
+
backbone=backbone,
|
281
|
+
num_classes=80,
|
282
|
+
bounding_box_format="yxyx",
|
283
|
+
)
|
284
|
+
|
285
|
+
# Sample training data.
|
286
|
+
images = np.random.uniform(
|
287
|
+
low=0, high=255, size=(2, 256, 256, 3)
|
288
|
+
).astype("float32")
|
289
|
+
bounding_boxes = {
|
290
|
+
"boxes": [
|
291
|
+
np.array([[10.0, 20.0, 20.0, 30.0], [20.0, 30.0, 30.0, 40.0]]),
|
292
|
+
np.array([[15.0, 25.0, 25.0, 35.0]]),
|
293
|
+
],
|
294
|
+
"labels": [
|
295
|
+
np.array([0, 2]), np.array([1])
|
296
|
+
],
|
297
|
+
}
|
298
|
+
|
299
|
+
# Compile the model.
|
300
|
+
detector.compile(
|
301
|
+
optimizer="adam",
|
302
|
+
loss=detector.compute_loss,
|
303
|
+
)
|
304
|
+
|
305
|
+
# Train the model.
|
306
|
+
detector.fit(x=images, y=bounding_boxes, epochs=1, batch_size=1)
|
307
|
+
```
|
308
|
+
|
309
|
+
**Making predictions:**
|
310
|
+
|
311
|
+
```python
|
312
|
+
import numpy as np
|
313
|
+
from keras_hub.src.models.d_fine.d_fine_object_detector import (
|
314
|
+
DFineObjectDetector
|
315
|
+
)
|
316
|
+
from keras_hub.src.models.d_fine.d_fine_backbone import DFineBackbone
|
317
|
+
from keras_hub.src.models.hgnetv2.hgnetv2_backbone import HGNetV2Backbone
|
318
|
+
|
319
|
+
# Initialize backbone and detector.
|
320
|
+
hgnetv2_backbone = HGNetV2Backbone(
|
321
|
+
stem_channels=[3, 16, 16],
|
322
|
+
stackwise_stage_filters=[
|
323
|
+
[16, 16, 64, 1, 3, 3],
|
324
|
+
[64, 32, 256, 1, 3, 3],
|
325
|
+
[256, 64, 512, 2, 3, 5],
|
326
|
+
[512, 128, 1024, 1, 3, 5],
|
327
|
+
],
|
328
|
+
apply_downsample=[False, True, True, True],
|
329
|
+
use_lightweight_conv_block=[False, False, True, True],
|
330
|
+
depths=[1, 1, 2, 1],
|
331
|
+
hidden_sizes=[64, 256, 512, 1024],
|
332
|
+
embedding_size=16,
|
333
|
+
use_learnable_affine_block=True,
|
334
|
+
hidden_act="relu",
|
335
|
+
image_shape=(256, 256, 3),
|
336
|
+
out_features=["stage3", "stage4"],
|
337
|
+
)
|
338
|
+
backbone = DFineBackbone(
|
339
|
+
backbone=hgnetv2_backbone,
|
340
|
+
decoder_in_channels=[128, 128],
|
341
|
+
encoder_hidden_dim=128,
|
342
|
+
num_denoising=100,
|
343
|
+
num_labels=80,
|
344
|
+
hidden_dim=128,
|
345
|
+
learn_initial_query=False,
|
346
|
+
num_queries=300,
|
347
|
+
anchor_image_size=(256, 256),
|
348
|
+
feat_strides=[16, 32],
|
349
|
+
num_feature_levels=2,
|
350
|
+
encoder_in_channels=[512, 1024],
|
351
|
+
encode_proj_layers=[1],
|
352
|
+
num_attention_heads=8,
|
353
|
+
encoder_ffn_dim=512,
|
354
|
+
num_encoder_layers=1,
|
355
|
+
hidden_expansion=0.34,
|
356
|
+
depth_multiplier=0.5,
|
357
|
+
eval_idx=-1,
|
358
|
+
num_decoder_layers=3,
|
359
|
+
decoder_attention_heads=8,
|
360
|
+
decoder_ffn_dim=512,
|
361
|
+
decoder_n_points=[6, 6],
|
362
|
+
lqe_hidden_dim=64,
|
363
|
+
num_lqe_layers=2,
|
364
|
+
out_features=["stage3", "stage4"],
|
365
|
+
image_shape=(256, 256, 3),
|
366
|
+
)
|
367
|
+
detector = DFineObjectDetector(
|
368
|
+
backbone=backbone,
|
369
|
+
num_classes=80,
|
370
|
+
bounding_box_format="yxyx",
|
371
|
+
)
|
372
|
+
|
373
|
+
# Sample test image.
|
374
|
+
test_image = np.random.uniform(
|
375
|
+
low=0, high=255, size=(1, 256, 256, 3)
|
376
|
+
).astype("float32")
|
377
|
+
|
378
|
+
# Make predictions.
|
379
|
+
predictions = detector.predict(test_image)
|
380
|
+
|
381
|
+
# Access predictions.
|
382
|
+
boxes = predictions["boxes"] # Shape: (1, 100, 4)
|
383
|
+
labels = predictions["labels"] # Shape: (1, 100)
|
384
|
+
confidence = predictions["confidence"] # Shape: (1, 100)
|
385
|
+
num_detections = predictions["num_detections"] # Shape: (1,)
|
386
|
+
```
|
387
|
+
"""

    backbone_cls = DFineBackbone
    preprocessor_cls = DFineObjectDetectorPreprocessor

    def __init__(
        self,
        backbone,
        num_classes,
        bounding_box_format="yxyx",
        preprocessor=None,
        matcher_class_cost=2.0,
        matcher_bbox_cost=5.0,
        matcher_ciou_cost=2.0,
        use_focal_loss=True,
        matcher_alpha=0.25,
        matcher_gamma=2.0,
        weight_loss_vfl=1.0,
        weight_loss_bbox=5.0,
        weight_loss_ciou=2.0,
        weight_loss_fgl=0.15,
        weight_loss_ddf=1.5,
        ddf_temperature=5.0,
        prediction_decoder=None,
        activation=None,
        **kwargs,
    ):
        assert_bounding_box_support(self.__class__.__name__)

        # === Functional Model ===
        image_input = keras.layers.Input(
            shape=backbone.image_shape, name="images"
        )
        outputs = backbone(image_input)
        intermediate_logits = outputs["intermediate_logits"]
        intermediate_reference_points = outputs["intermediate_reference_points"]
        intermediate_predicted_corners = outputs[
            "intermediate_predicted_corners"
        ]
        initial_reference_points = outputs["initial_reference_points"]
        logits = intermediate_logits[:, -1, :, :]
        pred_boxes = intermediate_reference_points[:, -1, :, :]
        model_outputs = {
            "logits": logits,
            "pred_boxes": pred_boxes,
            "intermediate_logits": intermediate_logits,
            "intermediate_reference_points": intermediate_reference_points,
            "intermediate_predicted_corners": intermediate_predicted_corners,
            "initial_reference_points": initial_reference_points,
            "enc_topk_logits": outputs["enc_topk_logits"],
            "enc_topk_bboxes": outputs["enc_topk_bboxes"],
        }
        if "dn_num_group" in outputs:
            model_outputs["dn_positive_idx"] = outputs["dn_positive_idx"]
            model_outputs["dn_num_group"] = outputs["dn_num_group"]
            model_outputs["dn_num_split"] = outputs["dn_num_split"]
        super().__init__(
            inputs=image_input,
            outputs=model_outputs,
            **kwargs,
        )

        # === Config ===
        self.backbone = backbone
        self.num_classes = num_classes
        self.bounding_box_format = bounding_box_format
        self.preprocessor = preprocessor
        self.matcher_class_cost = matcher_class_cost
        self.matcher_bbox_cost = matcher_bbox_cost
        self.matcher_ciou_cost = matcher_ciou_cost
        self.use_focal_loss = use_focal_loss
        self.matcher_alpha = matcher_alpha
        self.matcher_gamma = matcher_gamma
        self.weight_dict = {
            "loss_vfl": weight_loss_vfl,
            "loss_bbox": weight_loss_bbox,
            "loss_ciou": weight_loss_ciou,
            "loss_fgl": weight_loss_fgl,
            "loss_ddf": weight_loss_ddf,
        }
        self.ddf_temperature = ddf_temperature
        self.activation = activation
        self._prediction_decoder = prediction_decoder or NonMaxSuppression(
            from_logits=(self.activation != keras.activations.sigmoid),
            bounding_box_format=self.bounding_box_format,
            max_detections=backbone.num_queries,
        )

    def compute_loss(self, x, y, y_pred, sample_weight, **kwargs):
        gt_boxes = y["boxes"]
        gt_labels = y["labels"]
        batch_size = keras.ops.shape(gt_labels)[0]
        num_objects = keras.ops.shape(gt_labels)[1]
        num_targets_per_image = keras.ops.tile(
            keras.ops.expand_dims(num_objects, 0), [batch_size]
        )
        labels_for_item = keras.ops.reshape(gt_labels, [-1])
        boxes_for_item = keras.ops.reshape(gt_boxes, [-1, 4])
        targets = {"labels": labels_for_item, "boxes": boxes_for_item}

        intermediate_logits_all = y_pred["intermediate_logits"]
        intermediate_ref_points_all = y_pred["intermediate_reference_points"]
        predicted_corners_all = y_pred["intermediate_predicted_corners"]
        initial_ref_points_all = y_pred["initial_reference_points"]
        enc_topk_logits = y_pred["enc_topk_logits"]
        enc_topk_bboxes = y_pred["enc_topk_bboxes"]
        if "dn_num_group" in y_pred:
            denoising_meta_values = {
                "dn_positive_idx": y_pred["dn_positive_idx"],
                "dn_num_group": y_pred["dn_num_group"],
                "dn_num_split": y_pred["dn_num_split"],
            }
            dn_split_point = self.backbone.dn_split_point
            (
                dn_intermediate_logits,
                matching_intermediate_logits,
            ) = keras.ops.split(
                intermediate_logits_all, [dn_split_point], axis=2
            )
            (
                dn_intermediate_ref_points,
                matching_intermediate_ref_points,
            ) = keras.ops.split(
                intermediate_ref_points_all, [dn_split_point], axis=2
            )
            (
                dn_predicted_corners,
                matching_predicted_corners,
            ) = keras.ops.split(predicted_corners_all, [dn_split_point], axis=2)
            (
                dn_initial_ref_points,
                matching_initial_ref_points,
            ) = keras.ops.split(
                initial_ref_points_all, [dn_split_point], axis=2
            )
        else:
            denoising_meta_values = None
            matching_intermediate_logits = intermediate_logits_all
            matching_intermediate_ref_points = intermediate_ref_points_all
            matching_predicted_corners = predicted_corners_all
            matching_initial_ref_points = initial_ref_points_all
        matching_logits = matching_intermediate_logits[:, -1, :, :]
        matching_pred_boxes = matching_intermediate_ref_points[:, -1, :, :]
        outputs_without_aux = {
            "logits": matching_logits,
            "pred_boxes": keras.ops.clip(matching_pred_boxes, 0, 1),
        }
        indices = hungarian_matcher(
            outputs_without_aux,
            [targets],
            num_targets_per_image,
            self.use_focal_loss,
            self.matcher_alpha,
            self.matcher_gamma,
            self.matcher_bbox_cost,
            self.matcher_class_cost,
            self.matcher_ciou_cost,
            self.backbone,
        )
        num_boxes = keras.ops.shape(labels_for_item)[0]
        num_boxes = keras.ops.convert_to_tensor(num_boxes, dtype="float32")
        num_boxes = keras.ops.maximum(num_boxes, 1.0)
        losses = {}
        vfl_loss = compute_vfl_loss(
            outputs_without_aux,
            [targets],
            indices,
            num_boxes,
            self.num_classes,
            self.matcher_alpha,
            self.matcher_gamma,
        )
        losses.update(
            {
                k: v * self.weight_dict[k]
                for k, v in vfl_loss.items()
                if k in self.weight_dict
            }
        )
        box_losses = compute_box_losses(
            outputs_without_aux, [targets], indices, num_boxes
        )
        losses.update(
            {
                k: v * self.weight_dict[k]
                for k, v in box_losses.items()
                if k in self.weight_dict
            }
        )
        local_losses = compute_local_losses(
            {
                **outputs_without_aux,
                "pred_corners": matching_predicted_corners[:, -1, :, :],
                "ref_points": matching_initial_ref_points[:, -1, :, :],
                "teacher_corners": keras.ops.zeros_like(
                    matching_predicted_corners[:, -1, :, :]
                ),
                "teacher_logits": keras.ops.zeros_like(matching_logits),
            },
            [targets],
            indices,
            num_boxes,
            self.backbone,
            self.ddf_temperature,
            compute_ddf=False,
        )
        losses.update(
            {
                k: v * self.weight_dict[k]
                for k, v in local_losses.items()
                if k in self.weight_dict
            }
        )

        num_aux_layers = self.backbone.num_decoder_layers
        auxiliary_outputs_list = [
            {
                "logits": matching_intermediate_logits[:, i, :, :],
                "pred_boxes": keras.ops.clip(
                    matching_intermediate_ref_points[:, i, :, :], 0, 1
                ),
                "pred_corners": matching_predicted_corners[:, i, :, :],
                "ref_points": matching_initial_ref_points[:, i, :, :],
                "teacher_corners": matching_predicted_corners[:, -1, :, :],
                "teacher_logits": matching_intermediate_logits[:, -1, :, :],
            }
            for i in range(num_aux_layers)
        ]
        for i, aux_output in enumerate(auxiliary_outputs_list):
            aux_indices = hungarian_matcher(
                aux_output,
                [targets],
                num_targets_per_image,
                self.use_focal_loss,
                self.matcher_alpha,
                self.matcher_gamma,
                self.matcher_bbox_cost,
                self.matcher_class_cost,
                self.matcher_ciou_cost,
                self.backbone,
            )
            aux_vfl_loss = compute_vfl_loss(
                aux_output,
                [targets],
                aux_indices,
                num_boxes,
                self.num_classes,
                self.matcher_alpha,
                self.matcher_gamma,
            )
            aux_box_losses = compute_box_losses(
                aux_output, [targets], aux_indices, num_boxes
            )
            is_not_last_aux_layer = i < len(auxiliary_outputs_list) - 1
            aux_local_losses = compute_local_losses(
                aux_output,
                [targets],
                aux_indices,
                num_boxes,
                self.backbone,
                self.ddf_temperature,
                compute_ddf=is_not_last_aux_layer,
            )
            aux_losses = {**aux_vfl_loss, **aux_box_losses, **aux_local_losses}
            weighted_aux_losses = {
                k + f"_aux_{i}": v * self.weight_dict[k]
                for k, v in aux_losses.items()
                if k in self.weight_dict
            }
            losses.update(weighted_aux_losses)
        # Add encoder loss.
        enc_output = {
            "logits": enc_topk_logits,
            "pred_boxes": keras.ops.clip(enc_topk_bboxes, 0, 1),
        }
        enc_indices = hungarian_matcher(
            enc_output,
            [targets],
            num_targets_per_image,
            self.use_focal_loss,
            self.matcher_alpha,
            self.matcher_gamma,
            self.matcher_bbox_cost,
            self.matcher_class_cost,
            self.matcher_ciou_cost,
            self.backbone,
        )
        enc_vfl_loss = compute_vfl_loss(
            enc_output,
            [targets],
            enc_indices,
            num_boxes,
            self.num_classes,
            self.matcher_alpha,
            self.matcher_gamma,
        )
        enc_box_losses = compute_box_losses(
            enc_output, [targets], enc_indices, num_boxes
        )
        enc_losses = {**enc_vfl_loss, **enc_box_losses}
        weighted_enc_losses = {
            k + "_enc": v * self.weight_dict[k]
            for k, v in enc_losses.items()
            if k in self.weight_dict
        }
        losses.update(weighted_enc_losses)

        if denoising_meta_values is not None:
            dn_indices = get_cdn_matched_indices(denoising_meta_values)
            dn_num_group = denoising_meta_values["dn_num_group"][0]
            num_boxes_dn = num_boxes * keras.ops.cast(dn_num_group, "float32")
            num_dn_layers = self.backbone.num_decoder_layers + 1
            for i in range(num_dn_layers):
                is_not_last_layer = keras.ops.less(i, num_dn_layers - 1)
                teacher_idx = num_dn_layers - 1
                dn_aux_output = {
                    "logits": dn_intermediate_logits[:, i, :, :],
                    "pred_boxes": keras.ops.clip(
                        dn_intermediate_ref_points[:, i, :, :], 0, 1
                    ),
                    "pred_corners": dn_predicted_corners[:, i, :, :],
                    "ref_points": dn_initial_ref_points[:, i, :, :],
                    "teacher_corners": dn_predicted_corners[
                        :, teacher_idx, :, :
                    ],
                    "teacher_logits": dn_intermediate_logits[
                        :, teacher_idx, :, :
                    ],
                }
                vfl_loss = compute_vfl_loss(
                    dn_aux_output,
                    [targets],
                    dn_indices,
                    num_boxes_dn,
                    self.num_classes,
                    self.matcher_alpha,
                    self.matcher_gamma,
                )
                box_losses = compute_box_losses(
                    dn_aux_output, [targets], dn_indices, num_boxes_dn
                )
                local_losses = compute_local_losses(
                    dn_aux_output,
                    [targets],
                    dn_indices,
                    num_boxes_dn,
                    self.backbone,
                    self.ddf_temperature,
                    compute_ddf=is_not_last_layer,
                )
                all_losses = {**vfl_loss, **box_losses, **local_losses}
                weighted_losses = {
                    k + f"_dn_{i}": v * self.weight_dict[k]
                    for k, v in all_losses.items()
                    if k in self.weight_dict
                }
                losses.update(weighted_losses)
        total_loss = keras.ops.sum([v for v in losses.values()])
        return total_loss

    @property
    def prediction_decoder(self):
        return self._prediction_decoder

    @prediction_decoder.setter
    def prediction_decoder(self, prediction_decoder):
        if prediction_decoder.bounding_box_format != self.bounding_box_format:
            raise ValueError(
                "Expected `prediction_decoder` and `DFineObjectDetector` to "
                "use the same `bounding_box_format`, but got "
                "`prediction_decoder.bounding_box_format="
                f"{prediction_decoder.bounding_box_format}`, and "
                "`self.bounding_box_format="
                f"{self.bounding_box_format}`."
            )
        self._prediction_decoder = prediction_decoder
        self.make_predict_function(force=True)
        self.make_train_function(force=True)
        self.make_test_function(force=True)

    def decode_predictions(self, predictions, data):
        """Decodes raw model predictions into final bounding boxes.

        This method takes the raw output from the model (logits and normalized
        bounding boxes in center format) and converts them into the final
        detection format. The process involves:
        1. Denormalizing the bounding box coordinates to the original image
           dimensions.
        2. Converting boxes from center format `(cx, cy, w, h)` to corner
           format `(ymin, xmin, ymax, xmax)`.
        3. Applying non-maximum suppression to filter out overlapping boxes
           and keep only the most confident detections.

        Args:
            predictions: dict, A dictionary of tensors from the model,
                containing `"logits"` and `"pred_boxes"`.
            data: tuple, The input data tuple, from which the original images
                are extracted to obtain their dimensions for denormalization.

        Returns:
            Dictionary: Final predictions, containing `"boxes"`, `"labels"`,
            `"confidence"`, and `"num_detections"`.
        """
        if isinstance(data, (list, tuple)):
            images, _ = data
        else:
            images = data
        logits = predictions["logits"]
        pred_boxes = predictions["pred_boxes"]
        height, width, _ = keras.ops.shape(images)[1:]
        denormalized_boxes = keras.ops.stack(
            [
                pred_boxes[..., 0] * width,  # center_x
                pred_boxes[..., 1] * height,  # center_y
                pred_boxes[..., 2] * width,  # width
                pred_boxes[..., 3] * height,  # height
            ],
            axis=-1,
        )
        pred_boxes_xyxy = keras.utils.bounding_boxes.convert_format(
            denormalized_boxes,
            source="center_xywh",
            target="xyxy",
        )
        pred_boxes_yxyx = keras.ops.stack(
            [
                pred_boxes_xyxy[..., 1],  # y_min
                pred_boxes_xyxy[..., 0],  # x_min
                pred_boxes_xyxy[..., 3],  # y_max
                pred_boxes_xyxy[..., 2],  # x_max
            ],
            axis=-1,
        )
        y_pred = self.prediction_decoder(pred_boxes_yxyx, logits, images=images)
        return y_pred

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "num_classes": self.num_classes,
                "bounding_box_format": self.bounding_box_format,
                "matcher_class_cost": self.matcher_class_cost,
                "matcher_bbox_cost": self.matcher_bbox_cost,
                "matcher_ciou_cost": self.matcher_ciou_cost,
                "use_focal_loss": self.use_focal_loss,
                "matcher_alpha": self.matcher_alpha,
                "matcher_gamma": self.matcher_gamma,
                "weight_loss_vfl": self.weight_dict["loss_vfl"],
                "weight_loss_bbox": self.weight_dict["loss_bbox"],
                "weight_loss_ciou": self.weight_dict["loss_ciou"],
                "weight_loss_fgl": self.weight_dict["loss_fgl"],
                "weight_loss_ddf": self.weight_dict["loss_ddf"],
                "ddf_temperature": self.ddf_temperature,
                "prediction_decoder": keras.saving.serialize_keras_object(
                    self._prediction_decoder
                ),
            }
        )
        return config

    def predict_step(self, *args):
        outputs = super().predict_step(*args)
        if isinstance(outputs, tuple):
            return self.decode_predictions(outputs[0], args[-1]), outputs[1]
        return self.decode_predictions(outputs, *args)

    @classmethod
    def from_config(cls, config):
        config = config.copy()
        if "backbone" in config and isinstance(config["backbone"], dict):
            config["backbone"] = keras.saving.deserialize_keras_object(
                config["backbone"]
            )
        if "preprocessor" in config and isinstance(
            config["preprocessor"], dict
        ):
            config["preprocessor"] = keras.saving.deserialize_keras_object(
                config["preprocessor"]
            )
        if "prediction_decoder" in config and isinstance(
            config["prediction_decoder"], dict
        ):
            config["prediction_decoder"] = (
                keras.saving.deserialize_keras_object(
                    config["prediction_decoder"]
                )
            )
        return cls(**config)
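For readers skimming the diff: `decode_predictions` above denormalizes the model's normalized `center_xywh` boxes by the image height and width, converts them to corner coordinates, and reorders them to the detector's default `"yxyx"` layout before handing them to `NonMaxSuppression`. The following is a minimal standalone sketch of that coordinate math, using plain NumPy and made-up values; it is an illustration of the transform, not code from the wheel:

```python
import numpy as np

# One fake normalized prediction: (center_x, center_y, w, h) in [0, 1].
pred_boxes = np.array([[[0.5, 0.5, 0.2, 0.4]]])  # (batch=1, queries=1, 4)
height, width = 256, 256  # image size used for denormalization

# 1. Denormalize to pixel units.
cx = pred_boxes[..., 0] * width
cy = pred_boxes[..., 1] * height
w = pred_boxes[..., 2] * width
h = pred_boxes[..., 3] * height

# 2. Center format -> corner format.
x_min, y_min = cx - w / 2.0, cy - h / 2.0
x_max, y_max = cx + w / 2.0, cy + h / 2.0

# 3. Reorder corners to "yxyx" for the NMS decoder.
boxes_yxyx = np.stack([y_min, x_min, y_max, x_max], axis=-1)
print(boxes_yxyx)  # [[[ 76.8 102.4 179.2 153.6]]]
```

The `"yxyx"` ordering matters because the default `NonMaxSuppression` decoder is constructed with `bounding_box_format=self.bounding_box_format`, which defaults to `"yxyx"`.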