keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +6 -0
- keras_hub/models/__init__.py +21 -0
- keras_hub/src/layers/modeling/position_embedding.py +21 -6
- keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
- keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
- keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
- keras_hub/src/models/backbone.py +10 -15
- keras_hub/src/models/d_fine/__init__.py +0 -0
- keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
- keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
- keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
- keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
- keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
- keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
- keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
- keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
- keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
- keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
- keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
- keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
- keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
- keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
- keras_hub/src/models/parseq/__init__.py +0 -0
- keras_hub/src/models/parseq/parseq_backbone.py +134 -0
- keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
- keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
- keras_hub/src/models/parseq/parseq_decoder.py +418 -0
- keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
- keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
- keras_hub/src/tests/test_case.py +37 -1
- keras_hub/src/utils/preset_utils.py +49 -0
- keras_hub/src/utils/tensor_utils.py +23 -1
- keras_hub/src/utils/transformers/convert_vit.py +4 -1
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
keras_hub/src/models/d_fine/d_fine_backbone.py (new file)
@@ -0,0 +1,891 @@
import math

import keras
import numpy as np

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.models.backbone import Backbone
from keras_hub.src.models.d_fine.d_fine_decoder import DFineDecoder
from keras_hub.src.models.d_fine.d_fine_hybrid_encoder import DFineHybridEncoder
from keras_hub.src.models.d_fine.d_fine_layers import DFineAnchorGenerator
from keras_hub.src.models.d_fine.d_fine_layers import (
    DFineContrastiveDenoisingGroupGenerator,
)
from keras_hub.src.models.d_fine.d_fine_layers import (
    DFineInitialQueryAndReferenceGenerator,
)
from keras_hub.src.models.d_fine.d_fine_layers import DFineMLPPredictionHead
from keras_hub.src.models.d_fine.d_fine_layers import DFineSourceFlattener
from keras_hub.src.models.d_fine.d_fine_layers import (
    DFineSpatialShapesExtractor,
)
from keras_hub.src.models.d_fine.d_fine_utils import d_fine_kernel_initializer
from keras_hub.src.utils.keras_utils import standardize_data_format


class DFineDenoisingPreprocessorLayer(keras.layers.Layer):
    """Processes and prepares tensors for contrastive denoising.

    This layer is a helper used within the `DFineBackbone`'s functional model
    definition. Its primary role is to take the outputs from the
    `DFineContrastiveDenoisingGroupGenerator` and prepare them for the dynamic,
    per-batch forward pass, mostly since this functionality cannot be integrated
    directly into the `DFineBackbone` in the symbolic forward pass.

    The layer takes a tuple of `(pixel_values, input_query_class,
    denoising_bbox_unact, attention_mask)` and an optional
    `denoising_meta_values` dictionary as input to its `call` method.
    """

    def __init__(self, dtype=None, **kwargs):
        super().__init__(dtype=dtype, **kwargs)

    def call(self, inputs, denoising_meta_values=None):
        (
            pixel_values,
            input_query_class,
            denoising_bbox_unact,
            attention_mask,
        ) = inputs
        input_query_class_tensor = keras.ops.convert_to_tensor(
            input_query_class, dtype="int32"
        )
        denoising_bbox_unact_tensor = keras.ops.convert_to_tensor(
            denoising_bbox_unact, dtype=self.compute_dtype
        )
        attention_mask_tensor = keras.ops.convert_to_tensor(
            attention_mask, dtype=self.compute_dtype
        )
        outputs = {
            "input_query_class": input_query_class_tensor,
            "denoising_bbox_unact": denoising_bbox_unact_tensor,
            "attention_mask": attention_mask_tensor,
        }

        if denoising_meta_values is not None:
            batch_size = keras.ops.shape(pixel_values)[0]
            dn_positive_idx = denoising_meta_values["dn_positive_idx"]
            c_batch_size = keras.ops.shape(dn_positive_idx)[0]
            if c_batch_size == 0:
                outputs["dn_positive_idx"] = keras.ops.zeros(
                    (batch_size,) + keras.ops.shape(dn_positive_idx)[1:],
                    dtype=dn_positive_idx.dtype,
                )
            else:
                num_repeats = (batch_size + c_batch_size - 1) // c_batch_size
                dn_positive_idx_tiled = keras.ops.tile(
                    dn_positive_idx,
                    (num_repeats,)
                    + (1,) * (keras.ops.ndim(dn_positive_idx) - 1),
                )
                outputs["dn_positive_idx"] = dn_positive_idx_tiled[:batch_size]
            dn_num_group = denoising_meta_values["dn_num_group"]
            outputs["dn_num_group"] = keras.ops.tile(
                keras.ops.expand_dims(dn_num_group, 0), (batch_size,)
            )
            dn_num_split = denoising_meta_values["dn_num_split"]
            outputs["dn_num_split"] = keras.ops.tile(
                keras.ops.expand_dims(dn_num_split, 0), (batch_size, 1)
            )

        return outputs


@keras_hub_export("keras_hub.models.DFineBackbone")
class DFineBackbone(Backbone):
    """D-FINE Backbone for Object Detection.

    This class implements the core D-FINE architecture, which serves as the
    backbone for `DFineObjectDetector`. It integrates a `HGNetV2Backbone` for
    initial feature extraction, a `DFineHybridEncoder` for multi-scale feature
    fusion using FPN/PAN pathways, and a `DFineDecoder` for refining object
    queries.

    The backbone orchestrates the entire forward pass, from processing raw
    pixels to generating intermediate predictions. Key steps include:
    1. Extracting multi-scale feature maps using the HGNetV2 backbone.
    2. Fusing these features with the hybrid encoder.
    3. Generating anchor proposals and selecting the top-k to initialize
       decoder queries and reference points.
    4. Generating noisy queries for contrastive denoising (if the `labels`
       argument is provided).
    5. Passing the queries and fused features through the transformer decoder
       to produce iterative predictions for bounding boxes and class logits.

    Args:
        backbone: A `keras.Model` instance that serves as the feature extractor.
            While any `keras.Model` can be used, we highly recommend using a
            `keras_hub.models.HGNetV2Backbone` instance, as this architecture is
            optimized for its outputs. If a custom backbone is provided, it
            must have a `stage_names` attribute, or the `out_features` argument
            for this model must be specified. This requirement helps prevent
            hard-to-debug downstream dimensionality errors.
        decoder_in_channels: list, Channel dimensions of the multi-scale
            features from the hybrid encoder. This should typically be a list
            of `encoder_hidden_dim` repeated for each feature level.
        encoder_hidden_dim: int, Hidden dimension size for the encoder layers.
        num_labels: int, Number of object classes for detection.
        num_denoising: int, Number of denoising queries for contrastive
            denoising training. Set to `0` to disable denoising.
        learn_initial_query: bool, Whether to learn initial query embeddings.
        num_queries: int, Number of object queries for detection.
        anchor_image_size: tuple, Size of the anchor image as `(height, width)`.
        feat_strides: list, List of feature stride values for different pyramid
            levels.
        num_feature_levels: int, Number of feature pyramid levels to use.
        hidden_dim: int, Hidden dimension size for the model.
        encoder_in_channels: list, Channel dimensions of the feature maps from
            the backbone (`HGNetV2Backbone`) that are fed into the hybrid
            encoder.
        encode_proj_layers: list, List specifying projection layer
            configurations.
        num_attention_heads: int, Number of attention heads in encoder layers.
        encoder_ffn_dim: int, Feed-forward network dimension in encoder.
        num_encoder_layers: int, Number of encoder layers.
        hidden_expansion: float, Hidden dimension expansion factor.
        depth_multiplier: float, Depth multiplier for the backbone.
        eval_idx: int, Index for evaluation. Defaults to `-1` for the last
            layer.
        num_decoder_layers: int, Number of decoder layers.
        decoder_attention_heads: int, Number of attention heads in decoder
            layers.
        decoder_ffn_dim: int, Feed-forward network dimension in decoder.
        decoder_method: str, Decoder method. Can be either `"default"` or
            `"discrete"`. Defaults to `"default"`.
        decoder_n_points: list, Number of sampling points for deformable
            attention.
        lqe_hidden_dim: int, Hidden dimension for learned query embedding.
        num_lqe_layers: int, Number of layers in learned query embedding.
        label_noise_ratio: float, Ratio of label noise for denoising
            training. Defaults to `0.5`.
        box_noise_scale: float, Scale factor for box noise in denoising
            training. Defaults to `1.0`.
        labels: list or None, Ground truth labels for denoising training. This
            is passed during model initialization to construct the training
            graph for contrastive denoising. Each element should be a
            dictionary with `"boxes"` (numpy array of shape `[N, 4]` with
            normalized coordinates) and `"labels"` (numpy array of shape `[N]`
            with class indices). Required when `num_denoising > 0`. Defaults to
            `None`.
        seed: int or None, Random seed for reproducibility. Defaults to `None`.
        image_shape: tuple, Shape of input images as `(height, width,
            channels)`. Height and width can be `None` for variable input sizes.
            Defaults to `(None, None, 3)`.
        out_features: list or None, List of feature names to output from
            backbone. If `None`, uses the last `len(decoder_in_channels)`
            features from the backbone's `stage_names`. Defaults to `None`.
        data_format: str, The data format of the image channels. Can be either
            `"channels_first"` or `"channels_last"`. If `None` is specified,
            it will use the `image_data_format` value found in your Keras
            config file at `~/.keras/keras.json`. Defaults to `None`.
        dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The dtype
            to use for the model's computations and weights. Defaults to `None`.
        **kwargs: Additional keyword arguments passed to the base class.

    Example:
    ```python
    import keras
    import numpy as np
    from keras_hub.models import DFineBackbone
    from keras_hub.models import HGNetV2Backbone

    # Example 1: Basic usage without denoising.
    # First, build the `HGNetV2Backbone` instance.
    hgnetv2 = HGNetV2Backbone(
        stem_channels=[3, 16, 16],
        stackwise_stage_filters=[
            [16, 16, 64, 1, 3, 3],
            [64, 32, 256, 1, 3, 3],
            [256, 64, 512, 2, 3, 5],
            [512, 128, 1024, 1, 3, 5],
        ],
        apply_downsample=[False, True, True, True],
        use_lightweight_conv_block=[False, False, True, True],
        depths=[1, 1, 2, 1],
        hidden_sizes=[64, 256, 512, 1024],
        embedding_size=16,
        use_learnable_affine_block=True,
        hidden_act="relu",
        image_shape=(None, None, 3),
        out_features=["stage3", "stage4"],
        data_format="channels_last",
    )

    # Then, pass the backbone instance to `DFineBackbone`.
    backbone = DFineBackbone(
        backbone=hgnetv2,
        decoder_in_channels=[128, 128],
        encoder_hidden_dim=128,
        num_denoising=0,  # Disable denoising
        num_labels=80,
        hidden_dim=128,
        learn_initial_query=False,
        num_queries=300,
        anchor_image_size=(256, 256),
        feat_strides=[16, 32],
        num_feature_levels=2,
        encoder_in_channels=[512, 1024],
        encode_proj_layers=[1],
        num_attention_heads=8,
        encoder_ffn_dim=512,
        num_encoder_layers=1,
        hidden_expansion=0.34,
        depth_multiplier=0.5,
        eval_idx=-1,
        num_decoder_layers=3,
        decoder_attention_heads=8,
        decoder_ffn_dim=512,
        decoder_n_points=[6, 6],
        lqe_hidden_dim=64,
        num_lqe_layers=2,
        out_features=["stage3", "stage4"],
        image_shape=(None, None, 3),
        data_format="channels_last",
        seed=0,
    )

    # Prepare input data.
    input_data = keras.random.uniform((2, 256, 256, 3))

    # Forward pass.
    outputs = backbone(input_data)

    # Example 2: With contrastive denoising training.
    labels = [
        {
            "boxes": np.array([[0.5, 0.5, 0.2, 0.2], [0.4, 0.4, 0.1, 0.1]]),
            "labels": np.array([1, 10]),
        },
        {
            "boxes": np.array([[0.6, 0.6, 0.3, 0.3]]),
            "labels": np.array([20]),
        },
    ]

    # Pass the `HGNetV2Backbone` instance to `DFineBackbone`.
    backbone_with_denoising = DFineBackbone(
        backbone=hgnetv2,
        decoder_in_channels=[128, 128],
        encoder_hidden_dim=128,
        num_denoising=100,  # Enable denoising
        num_labels=80,
        hidden_dim=128,
        learn_initial_query=False,
        num_queries=300,
        anchor_image_size=(256, 256),
        feat_strides=[16, 32],
        num_feature_levels=2,
        encoder_in_channels=[512, 1024],
        encode_proj_layers=[1],
        num_attention_heads=8,
        encoder_ffn_dim=512,
        num_encoder_layers=1,
        hidden_expansion=0.34,
        depth_multiplier=0.5,
        eval_idx=-1,
        num_decoder_layers=3,
        decoder_attention_heads=8,
        decoder_ffn_dim=512,
        decoder_n_points=[6, 6],
        lqe_hidden_dim=64,
        num_lqe_layers=2,
        out_features=["stage3", "stage4"],
        image_shape=(None, None, 3),
        seed=0,
        labels=labels,
    )

    # Forward pass with denoising.
    outputs_with_denoising = backbone_with_denoising(input_data)
    ```
    """

    def __init__(
        self,
        backbone,
        decoder_in_channels,
        encoder_hidden_dim,
        num_labels,
        num_denoising,
        learn_initial_query,
        num_queries,
        anchor_image_size,
        feat_strides,
        num_feature_levels,
        hidden_dim,
        encoder_in_channels,
        encode_proj_layers,
        num_attention_heads,
        encoder_ffn_dim,
        num_encoder_layers,
        hidden_expansion,
        depth_multiplier,
        eval_idx,
        num_decoder_layers,
        decoder_attention_heads,
        decoder_ffn_dim,
        decoder_n_points,
        lqe_hidden_dim,
        num_lqe_layers,
        decoder_method="default",
        label_noise_ratio=0.5,
        box_noise_scale=1.0,
        labels=None,
        seed=None,
        image_shape=(None, None, 3),
        out_features=None,
        data_format=None,
        dtype=None,
        **kwargs,
    ):
        if decoder_method not in ["default", "discrete"]:
            decoder_method = "default"
        data_format = standardize_data_format(data_format)
        channel_axis = -1 if data_format == "channels_last" else 1
        self.backbone = backbone
        # Re-instantiate the backbone if its data_format mismatches the parent's.
        if (
            hasattr(self.backbone, "data_format")
            and self.backbone.data_format != data_format
        ):
            backbone_config = self.backbone.get_config()
            backbone_config["data_format"] = data_format
            if (
                "image_shape" in backbone_config
                and backbone_config["image_shape"] is not None
                and len(backbone_config["image_shape"]) == 3
            ):
                backbone_config["image_shape"] = tuple(
                    reversed(backbone_config["image_shape"])
                )
            self.backbone = self.backbone.__class__.from_config(backbone_config)
        spatial_shapes = []
        for s in feat_strides:
            h = anchor_image_size[0] // s
            w = anchor_image_size[1] // s
            spatial_shapes.append((h, w))
        # NOTE: While `HGNetV2Backbone` is handled automatically, `out_features`
        # must be specified for custom backbones. This design choice prevents
        # hard-to-debug dimension mismatches by placing the onus on the user for
        # ensuring compatibility.
        if not hasattr(self.backbone, "stage_names") and out_features is None:
            raise ValueError(
                "`out_features` must be specified when using a custom "
                "backbone that does not have a `stage_names` attribute."
            )
        stage_names = getattr(self.backbone, "stage_names", out_features)
        out_features = (
            out_features
            if out_features is not None
            else stage_names[-len(decoder_in_channels) :]
        )
        initializer = d_fine_kernel_initializer(
            initializer_range=0.01,
        )

        # === Layers ===
        self.encoder = DFineHybridEncoder(
            encoder_in_channels=encoder_in_channels,
            feat_strides=feat_strides,
            encoder_hidden_dim=encoder_hidden_dim,
            encode_proj_layers=encode_proj_layers,
            positional_encoding_temperature=10000,
            eval_size=None,
            normalize_before=False,
            num_attention_heads=num_attention_heads,
            dropout=0.0,
            layer_norm_eps=1e-5,
            encoder_activation_function="gelu",
            activation_dropout=0.0,
            encoder_ffn_dim=encoder_ffn_dim,
            num_encoder_layers=num_encoder_layers,
            batch_norm_eps=1e-5,
            hidden_expansion=hidden_expansion,
            depth_multiplier=depth_multiplier,
            kernel_initializer=initializer,
            bias_initializer="zeros",
            channel_axis=channel_axis,
            data_format=data_format,
            dtype=dtype,
            name="hybrid_encoder",
        )
        self.decoder = DFineDecoder(
            layer_scale=1.0,
            eval_idx=eval_idx,
            num_decoder_layers=num_decoder_layers,
            dropout=0.0,
            hidden_dim=hidden_dim,
            reg_scale=4.0,
            max_num_bins=32,
            upsampling_factor=0.5,
            decoder_attention_heads=decoder_attention_heads,
            attention_dropout=0.0,
            decoder_activation_function="relu",
            activation_dropout=0.0,
            layer_norm_eps=1e-5,
            decoder_ffn_dim=decoder_ffn_dim,
            num_feature_levels=num_feature_levels,
            decoder_offset_scale=0.5,
            decoder_method=decoder_method,
            decoder_n_points=decoder_n_points,
            top_prob_values=4,
            lqe_hidden_dim=lqe_hidden_dim,
            num_lqe_layers=num_lqe_layers,
            num_labels=num_labels,
            spatial_shapes=spatial_shapes,
            dtype=dtype,
            initializer_bias_prior_prob=None,
            num_queries=num_queries,
            name="decoder",
        )
        self.anchor_generator = DFineAnchorGenerator(
            anchor_image_size=anchor_image_size,
            feat_strides=feat_strides,
            data_format=data_format,
            dtype=dtype,
            name="anchor_generator",
        )
        self.contrastive_denoising_group_generator = (
            DFineContrastiveDenoisingGroupGenerator(
                num_labels=num_labels,
                num_denoising=num_denoising,
                label_noise_ratio=label_noise_ratio,
                box_noise_scale=box_noise_scale,
                seed=seed,
                dtype=dtype,
                name="contrastive_denoising_group_generator",
            )
        )
        if num_denoising > 0:
            self.denoising_class_embed = keras.layers.Embedding(
                input_dim=num_labels + 1,
                output_dim=hidden_dim,
                embeddings_initializer="glorot_uniform",
                name="denoising_class_embed",
                dtype=dtype,
            )
            self.denoising_class_embed.build(None)
        else:
            self.denoising_class_embed = None

        self.source_flattener = DFineSourceFlattener(
            dtype=dtype,
            name="source_flattener",
            channel_axis=channel_axis,
            data_format=data_format,
        )
        self.initial_query_reference_generator = (
            DFineInitialQueryAndReferenceGenerator(
                num_queries=num_queries,
                learn_initial_query=learn_initial_query,
                hidden_dim=hidden_dim,
                dtype=dtype,
                name="initial_query_reference_generator",
            )
        )
        self.spatial_shapes_extractor = DFineSpatialShapesExtractor(
            dtype=dtype,
            data_format=data_format,
            name="spatial_shapes_extractor",
        )
        num_backbone_outs = len(decoder_in_channels)
        self.encoder_input_proj_layers = []
        for i in range(num_backbone_outs):
            self.encoder_input_proj_layers.append(
                [
                    keras.layers.Conv2D(
                        filters=encoder_hidden_dim,
                        kernel_size=1,
                        use_bias=False,
                        kernel_initializer=initializer,
                        bias_initializer="zeros",
                        data_format=data_format,
                        name=f"encoder_input_proj_conv_{i}",
                        dtype=dtype,
                    ),
                    keras.layers.BatchNormalization(
                        epsilon=1e-5,
                        axis=channel_axis,
                        name=f"encoder_input_proj_bn_{i}",
                        dtype=dtype,
                    ),
                ]
            )
        self.enc_output_layers = [
            keras.layers.Dense(
                hidden_dim,
                name="enc_output_dense",
                dtype=dtype,
            ),
            keras.layers.LayerNormalization(
                epsilon=1e-5,
                name="enc_output_ln",
                dtype=dtype,
            ),
        ]
        prior_prob = 1 / (num_labels + 1)
        enc_score_head_bias = float(-math.log((1 - prior_prob) / prior_prob))
        self.enc_score_head = keras.layers.Dense(
            num_labels,
            name="enc_score_head",
            dtype=dtype,
            kernel_initializer="glorot_uniform",
            bias_initializer=keras.initializers.Constant(enc_score_head_bias),
        )
        self.enc_bbox_head = DFineMLPPredictionHead(
            input_dim=hidden_dim,
            hidden_dim=hidden_dim,
            output_dim=4,
            num_layers=3,
            name="enc_bbox_head",
            dtype=dtype,
            kernel_initializer=initializer,
            last_layer_initializer="zeros",
        )
        self.decoder_input_proj_layers = []
        for i in range(num_backbone_outs):
            if hidden_dim == decoder_in_channels[-1]:
                proj_layer = keras.layers.Identity(
                    name=f"decoder_input_proj_identity_{i}",
                    dtype=dtype,
                )
                self.decoder_input_proj_layers.append(proj_layer)
            else:
                self.decoder_input_proj_layers.append(
                    [
                        keras.layers.Conv2D(
                            filters=hidden_dim,
                            kernel_size=1,
                            use_bias=False,
                            kernel_initializer=initializer,
                            bias_initializer="zeros",
                            data_format=data_format,
                            name=f"decoder_input_proj_conv1_{i}",
                            dtype=dtype,
                        ),
                        keras.layers.BatchNormalization(
                            epsilon=1e-5,
                            axis=channel_axis,
                            name=f"decoder_input_proj_bn1_{i}",
                            dtype=dtype,
                        ),
                    ]
                )
        for i in range(num_feature_levels - num_backbone_outs):
            idx = num_backbone_outs + i
            if hidden_dim == decoder_in_channels[-1]:
                proj_layer = keras.layers.Identity(
                    name=f"decoder_input_proj_identity_{idx}",
                    dtype=dtype,
                )
                self.decoder_input_proj_layers.append(proj_layer)
            else:
                self.decoder_input_proj_layers.append(
                    [
                        keras.layers.Conv2D(
                            filters=hidden_dim,
                            kernel_size=3,
                            strides=2,
                            padding="same",
                            use_bias=False,
                            kernel_initializer=initializer,
                            bias_initializer="zeros",
                            data_format=data_format,
                            name=f"decoder_input_proj_conv3_{idx}",
                            dtype=dtype,
                        ),
                        keras.layers.BatchNormalization(
                            epsilon=1e-5,
                            axis=channel_axis,
                            name=f"decoder_input_proj_bn3_{idx}",
                            dtype=dtype,
                        ),
                    ]
                )
        self.dn_split_point = None

        # === Functional Model ===
        pixel_values = keras.Input(
            shape=image_shape, name="pixel_values", dtype="float32"
        )
        feature_maps_output = self.backbone(pixel_values)
        feature_maps = [feature_maps_output[stage] for stage in out_features]
        feature_maps_output_tuple = tuple(feature_maps)
        proj_feats = []
        for level, feature_map in enumerate(feature_maps_output_tuple):
            x = self.encoder_input_proj_layers[level][0](feature_map)
            x = self.encoder_input_proj_layers[level][1](x)
            proj_feats.append(x)
        encoder_outputs = self.encoder(
            inputs_embeds=proj_feats,
            output_hidden_states=True,
            output_attentions=True,
        )
        encoder_last_hidden_state = encoder_outputs[0]
        encoder_hidden_states = (
            encoder_outputs[1] if len(encoder_outputs) > 1 else None
        )
        encoder_attentions = (
            encoder_outputs[2] if len(encoder_outputs) > 2 else None
        )
        last_hidden_state = encoder_outputs[0]
        sources = []
        # NOTE: Handle both no-op (identity mapping) and an actual projection
        # using Conv2D and BatchNorm with `isinstance(proj, list)`.
        for level, source in enumerate(last_hidden_state):
            proj = self.decoder_input_proj_layers[level]
            if isinstance(proj, list):
                x = proj[0](source)
                x = proj[1](x)
                sources.append(x)
            else:
                sources.append(proj(source))
        if num_feature_levels > len(sources):
            len_sources = len(sources)
            proj = self.decoder_input_proj_layers[len_sources]
            if isinstance(proj, list):
                x = proj[0](last_hidden_state[-1])
                x = proj[1](x)
                sources.append(x)
            else:
                sources.append(proj(last_hidden_state[-1]))
            for i in range(len_sources + 1, num_feature_levels):
                proj = self.decoder_input_proj_layers[i]
                if isinstance(proj, list):
                    x = proj[0](sources[-1])
                    x = proj[1](x)
                    sources.append(x)
                else:
                    sources.append(proj(sources[-1]))
        spatial_shapes_tensor = self.spatial_shapes_extractor(sources)
        source_flatten = self.source_flattener(sources)
        if num_denoising > 0 and labels is not None:
            (
                input_query_class,
                denoising_bbox_unact,
                attention_mask,
                denoising_meta_values,
            ) = self.contrastive_denoising_group_generator(
                targets=labels,
                num_queries=num_queries,
            )
            self.dn_split_point = int(denoising_meta_values["dn_num_split"][0])
        else:
            (
                denoising_class,
                denoising_bbox_unact,
                attention_mask,
                denoising_meta_values,
            ) = None, None, None, None

        if num_denoising > 0 and labels is not None:
            denoising_processor = DFineDenoisingPreprocessorLayer(
                name="denoising_processor", dtype=dtype
            )
            denoising_tensors = denoising_processor(
                [
                    pixel_values,
                    input_query_class,
                    denoising_bbox_unact,
                    attention_mask,
                ],
                denoising_meta_values=denoising_meta_values,
            )
            input_query_class_tensor = denoising_tensors["input_query_class"]
            denoising_bbox_unact = denoising_tensors["denoising_bbox_unact"]
            attention_mask = denoising_tensors["attention_mask"]
            denoising_class = self.denoising_class_embed(
                input_query_class_tensor
            )

        anchors, valid_mask = self.anchor_generator(sources)
        memory = keras.ops.where(valid_mask, source_flatten, 0.0)
        output_memory = self.enc_output_layers[0](memory)
        output_memory = self.enc_output_layers[1](output_memory)
        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_logits = self.enc_bbox_head(output_memory)
        enc_outputs_coord_logits_plus_anchors = (
            enc_outputs_coord_logits + anchors
        )
        init_reference_points, target, enc_topk_logits, enc_topk_bboxes = (
            self.initial_query_reference_generator(
                (
                    enc_outputs_class,
                    enc_outputs_coord_logits_plus_anchors,
                    output_memory,
                    sources[-1],
                ),
                denoising_bbox_unact=denoising_bbox_unact,
                denoising_class=denoising_class,
            )
        )
        decoder_outputs = self.decoder(
            inputs_embeds=target,
            encoder_hidden_states=source_flatten,
            reference_points=init_reference_points,
            spatial_shapes=spatial_shapes_tensor,
            attention_mask=attention_mask,
            output_hidden_states=True,
            output_attentions=True,
        )
        last_hidden_state = decoder_outputs[0]
        intermediate_hidden_states = decoder_outputs[1]
        intermediate_logits = decoder_outputs[2]
        intermediate_reference_points = decoder_outputs[3]
        intermediate_predicted_corners = decoder_outputs[4]
        initial_reference_points = decoder_outputs[5]
        decoder_hidden_states = (
            decoder_outputs[6] if len(decoder_outputs) > 6 else None
        )
        decoder_attentions = (
            decoder_outputs[7] if len(decoder_outputs) > 7 else None
        )
        cross_attentions = (
            decoder_outputs[8] if len(decoder_outputs) > 8 else None
        )
        outputs = {
            "last_hidden_state": last_hidden_state,
            "intermediate_hidden_states": intermediate_hidden_states,
            "intermediate_logits": intermediate_logits,
            "intermediate_reference_points": intermediate_reference_points,
            "intermediate_predicted_corners": intermediate_predicted_corners,
            "initial_reference_points": initial_reference_points,
            "decoder_hidden_states": decoder_hidden_states,
            "decoder_attentions": decoder_attentions,
            "cross_attentions": cross_attentions,
            "encoder_last_hidden_state": encoder_last_hidden_state[0],
            "encoder_hidden_states": encoder_hidden_states,
            "encoder_attentions": encoder_attentions,
            "init_reference_points": init_reference_points,
            "enc_topk_logits": enc_topk_logits,
            "enc_topk_bboxes": enc_topk_bboxes,
            "enc_outputs_class": enc_outputs_class,
            "enc_outputs_coord_logits": enc_outputs_coord_logits,
        }

        if num_denoising > 0 and labels is not None:
            outputs["dn_positive_idx"] = denoising_tensors["dn_positive_idx"]
            outputs["dn_num_group"] = denoising_tensors["dn_num_group"]
            outputs["dn_num_split"] = denoising_tensors["dn_num_split"]

        outputs = {k: v for k, v in outputs.items() if v is not None}
        super().__init__(
            inputs=pixel_values,
            outputs=outputs,
            dtype=dtype,
            **kwargs,
        )

        # === Config ===
        self.decoder_in_channels = decoder_in_channels
        self.encoder_hidden_dim = encoder_hidden_dim
        self.num_labels = num_labels
        self.num_denoising = num_denoising
        self.learn_initial_query = learn_initial_query
        self.num_queries = num_queries
        self.anchor_image_size = anchor_image_size
        self.feat_strides = feat_strides
        self.num_feature_levels = num_feature_levels
        self.hidden_dim = hidden_dim
        self.encoder_in_channels = encoder_in_channels
        self.encode_proj_layers = encode_proj_layers
        self.num_attention_heads = num_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.num_encoder_layers = num_encoder_layers
        self.hidden_expansion = hidden_expansion
        self.depth_multiplier = depth_multiplier
        self.eval_idx = eval_idx
        self.box_noise_scale = box_noise_scale
        self.labels = labels
        self.label_noise_ratio = label_noise_ratio
        self.num_decoder_layers = num_decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_method = decoder_method
        self.decoder_n_points = decoder_n_points
        self.lqe_hidden_dim = lqe_hidden_dim
        self.num_lqe_layers = num_lqe_layers
        self.data_format = data_format
        self.seed = seed
        self.image_shape = image_shape
        self.channel_axis = channel_axis
        self.spatial_shapes = spatial_shapes
        self.stage_names = stage_names
        self.out_features = out_features
        self.initializer = initializer

    def get_config(self):
        config = super().get_config()
        serializable_labels = None
        if self.labels is not None:
            serializable_labels = []
            for target in self.labels:
                serializable_target = {}
                for key, value in target.items():
                    if hasattr(value, "tolist"):
                        serializable_target[key] = value.tolist()
                    else:
                        serializable_target[key] = value
                serializable_labels.append(serializable_target)
        config.update(
            {
                "backbone": keras.layers.serialize(self.backbone),
                "decoder_in_channels": self.decoder_in_channels,
                "encoder_hidden_dim": self.encoder_hidden_dim,
                "num_labels": self.num_labels,
                "num_denoising": self.num_denoising,
                "learn_initial_query": self.learn_initial_query,
                "num_queries": self.num_queries,
                "anchor_image_size": self.anchor_image_size,
                "feat_strides": self.feat_strides,
                "num_feature_levels": self.num_feature_levels,
                "hidden_dim": self.hidden_dim,
                "encoder_in_channels": self.encoder_in_channels,
                "encode_proj_layers": self.encode_proj_layers,
                "num_attention_heads": self.num_attention_heads,
                "encoder_ffn_dim": self.encoder_ffn_dim,
                "num_encoder_layers": self.num_encoder_layers,
                "hidden_expansion": self.hidden_expansion,
                "depth_multiplier": self.depth_multiplier,
                "eval_idx": self.eval_idx,
                "box_noise_scale": self.box_noise_scale,
                "label_noise_ratio": self.label_noise_ratio,
                "labels": serializable_labels,
                "num_decoder_layers": self.num_decoder_layers,
                "decoder_attention_heads": self.decoder_attention_heads,
                "decoder_ffn_dim": self.decoder_ffn_dim,
                "decoder_method": self.decoder_method,
                "decoder_n_points": self.decoder_n_points,
                "lqe_hidden_dim": self.lqe_hidden_dim,
                "num_lqe_layers": self.num_lqe_layers,
                "seed": self.seed,
                "image_shape": self.image_shape,
                "data_format": self.data_format,
                "out_features": self.out_features,
            }
        )
        return config

    @classmethod
    def from_config(cls, config, custom_objects=None):
        config = config.copy()
        if "labels" in config and config["labels"] is not None:
            labels = config["labels"]
            deserialized_labels = []
            for target in labels:
                deserialized_target = {}
                for key, value in target.items():
                    if isinstance(value, list):
                        deserialized_target[key] = np.array(value)
                    else:
                        deserialized_target[key] = value
                deserialized_labels.append(deserialized_target)
            config["labels"] = deserialized_labels
        if "dtype" in config and config["dtype"] is not None:
            dtype_config = config["dtype"]
            if "dtype" not in config["backbone"]["config"]:
                config["backbone"]["config"]["dtype"] = dtype_config
        config["backbone"] = keras.layers.deserialize(
            config["backbone"], custom_objects=custom_objects
        )
        return cls(**config)
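
The `DFineDenoisingPreprocessorLayer` added above broadcasts the denoising metadata, which is computed once at graph-construction time from the `labels` argument, to whatever batch size arrives at runtime: ceil-divide, tile along the batch axis, then trim. A minimal standalone sketch of that broadcast is shown below; the shapes and index values are illustrative, not taken from the package.

```python
import keras

# Illustrative metadata built for a compile-time batch of 2 images with
# 3 positive denoising indices each (values are made up for this sketch).
dn_positive_idx = keras.ops.convert_to_tensor([[0, 1, 2], [3, 4, 5]], "int32")

batch_size = 5                           # runtime batch size
c_batch_size = dn_positive_idx.shape[0]  # compile-time batch size (2)

# Ceil division, then tile along the batch axis and cut back to `batch_size`,
# mirroring the non-empty branch of `DFineDenoisingPreprocessorLayer.call`.
num_repeats = (batch_size + c_batch_size - 1) // c_batch_size
tiled = keras.ops.tile(dn_positive_idx, (num_repeats, 1))[:batch_size]

print(tiled.shape)  # (5, 3): original rows repeated, trimmed to the batch
```

The same tile-and-trim pattern is applied to `dn_num_group` and `dn_num_split`, which is why the backbone can be built with a small fixed set of `labels` yet still accept arbitrary batch sizes at call time.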