keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- keras_hub/layers/__init__.py +6 -0
- keras_hub/models/__init__.py +21 -0
- keras_hub/src/layers/modeling/position_embedding.py +21 -6
- keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
- keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
- keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
- keras_hub/src/models/backbone.py +10 -15
- keras_hub/src/models/d_fine/__init__.py +0 -0
- keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
- keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
- keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
- keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
- keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
- keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
- keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
- keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
- keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
- keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
- keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
- keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
- keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
- keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
- keras_hub/src/models/parseq/__init__.py +0 -0
- keras_hub/src/models/parseq/parseq_backbone.py +134 -0
- keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
- keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
- keras_hub/src/models/parseq/parseq_decoder.py +418 -0
- keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
- keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
- keras_hub/src/tests/test_case.py +37 -1
- keras_hub/src/utils/preset_utils.py +49 -0
- keras_hub/src/utils/tensor_utils.py +23 -1
- keras_hub/src/utils/transformers/convert_vit.py +4 -1
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
keras_hub/src/models/d_fine/d_fine_encoder.py (new file)
@@ -0,0 +1,365 @@
import keras
import numpy as np

from keras_hub.src.models.d_fine.d_fine_attention import DFineMultiheadAttention
from keras_hub.src.utils.keras_utils import clone_initializer


class DFineEncoderLayer(keras.layers.Layer):
    """Single encoder layer for D-FINE models.

    This layer is the fundamental building block of the `DFineEncoder`. It
    implements a standard transformer encoder layer with multi-head
    self-attention (`DFineMultiheadAttention`) and a feed-forward network. It is
    used to process and refine the feature sequences from the CNN backbone.

    Args:
        normalize_before: bool, Whether to apply layer normalization before
            the attention and feed-forward sub-layers (pre-norm) or after
            (post-norm).
        encoder_hidden_dim: int, Hidden dimension size of the encoder.
        num_attention_heads: int, Number of attention heads in multi-head
            attention.
        dropout: float, Dropout probability applied to attention outputs and
            feed-forward outputs.
        layer_norm_eps: float, Small constant added to the denominator for
            numerical stability in layer normalization.
        encoder_activation_function: str, Activation function used in the
            feed-forward network.
        activation_dropout: float, Dropout probability applied after the
            activation function in the feed-forward network.
        encoder_ffn_dim: int, Hidden dimension size of the feed-forward network.
        **kwargs: Additional keyword arguments passed to the parent class.
        kernel_initializer: str or Initializer, optional, Initializer for
            the kernel weights. Defaults to `"glorot_uniform"`.
        bias_initializer: str or Initializer, optional, Initializer for
            the bias weights. Defaults to `"zeros"`.
    """

    def __init__(
        self,
        normalize_before,
        encoder_hidden_dim,
        num_attention_heads,
        dropout,
        layer_norm_eps,
        encoder_activation_function,
        activation_dropout,
        encoder_ffn_dim,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        dtype=None,
        **kwargs,
    ):
        super().__init__(dtype=dtype, **kwargs)
        self.normalize_before = normalize_before
        self.encoder_hidden_dim = encoder_hidden_dim
        self.num_attention_heads = num_attention_heads
        self.dropout_rate = dropout
        self.layer_norm_eps = layer_norm_eps
        self.encoder_activation_function = encoder_activation_function
        self.activation_dropout_rate = activation_dropout
        self.encoder_ffn_dim = encoder_ffn_dim
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)
        self.self_attn = DFineMultiheadAttention(
            embedding_dim=self.encoder_hidden_dim,
            num_heads=self.num_attention_heads,
            dropout=self.dropout_rate,
            dtype=self.dtype_policy,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
            name="self_attn",
        )
        self.self_attn_layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_eps,
            name="self_attn_layer_norm",
            dtype=self.dtype_policy,
        )
        self.dropout_layer = keras.layers.Dropout(
            rate=self.dropout_rate,
            name="dropout_layer",
            dtype=self.dtype_policy,
        )
        self.activation_fn_layer = keras.layers.Activation(
            self.encoder_activation_function,
            name="activation_fn_layer",
            dtype=self.dtype_policy,
        )
        self.activation_dropout_layer = keras.layers.Dropout(
            rate=self.activation_dropout_rate,
            name="activation_dropout_layer",
            dtype=self.dtype_policy,
        )
        self.fc1 = keras.layers.Dense(
            self.encoder_ffn_dim,
            name="fc1",
            dtype=self.dtype_policy,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
        )
        self.fc2 = keras.layers.Dense(
            self.encoder_hidden_dim,
            name="fc2",
            dtype=self.dtype_policy,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
        )
        self.final_layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_eps,
            name="final_layer_norm",
            dtype=self.dtype_policy,
        )

    def build(self, input_shape):
        self.self_attn.build(input_shape)
        self.self_attn_layer_norm.build(input_shape)
        self.fc1.build(input_shape)
        self.fc2.build((input_shape[0], input_shape[1], self.encoder_ffn_dim))
        self.final_layer_norm.build(input_shape)
        super().build(input_shape)

    def call(
        self,
        hidden_states,
        attention_mask=None,
        position_embeddings=None,
        output_attentions=False,
        training=None,
    ):
        residual = hidden_states
        if self.normalize_before:
            hidden_states = self.self_attn_layer_norm(
                hidden_states, training=training
            )
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            training=training,
        )
        hidden_states = self.dropout_layer(hidden_states, training=training)
        hidden_states = residual + hidden_states
        if not self.normalize_before:
            hidden_states = self.self_attn_layer_norm(
                hidden_states, training=training
            )
        if self.normalize_before:
            hidden_states = self.final_layer_norm(
                hidden_states, training=training
            )
        residual_ffn = hidden_states
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn_layer(hidden_states)
        hidden_states = self.activation_dropout_layer(
            hidden_states, training=training
        )
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.dropout_layer(hidden_states, training=training)
        hidden_states = residual_ffn + hidden_states
        if not self.normalize_before:
            hidden_states = self.final_layer_norm(
                hidden_states, training=training
            )
        if training:
            dtype_name = keras.backend.standardize_dtype(self.compute_dtype)
            if dtype_name == "float16":
                clamp_value = np.finfo(np.float16).max - 1000.0
            else:  # float32, bfloat16
                clamp_value = np.finfo(np.float32).max - 1000.0
            hidden_states = keras.ops.clip(
                hidden_states, x_min=-clamp_value, x_max=clamp_value
            )
        if output_attentions:
            return hidden_states, attn_weights
        return hidden_states

    def compute_output_spec(
        self,
        hidden_states,
        attention_mask=None,
        position_embeddings=None,
        output_attentions=False,
        training=None,
    ):
        attn_output_spec = self.self_attn.compute_output_spec(
            hidden_states,
            position_embeddings,
            attention_mask,
            output_attentions,
        )
        if output_attentions:
            hidden_states_output_spec, self_attn_weights_spec = attn_output_spec
            return hidden_states_output_spec, self_attn_weights_spec
        return attn_output_spec

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "normalize_before": self.normalize_before,
                "encoder_hidden_dim": self.encoder_hidden_dim,
                "num_attention_heads": self.num_attention_heads,
                "dropout": self.dropout_rate,
                "layer_norm_eps": self.layer_norm_eps,
                "encoder_activation_function": self.encoder_activation_function,
                "activation_dropout": self.activation_dropout_rate,
                "encoder_ffn_dim": self.encoder_ffn_dim,
                "kernel_initializer": keras.initializers.serialize(
                    self.kernel_initializer
                ),
                "bias_initializer": keras.initializers.serialize(
                    self.bias_initializer
                ),
            }
        )
        return config


class DFineEncoder(keras.layers.Layer):
    """Multi-layer encoder for D-FINE models.

    This layer implements a stack of `DFineEncoderLayer` instances. It is used
    within the `DFineHybridEncoder` to apply transformer-based processing to
    the feature maps from the CNN backbone, creating rich contextual
    representations before they are passed to the FPN/PAN pathways.

    Args:
        normalize_before: bool, Whether to apply layer normalization before
            the attention and feed-forward sub-layers (pre-norm) or after
            (post-norm) in each encoder layer.
        encoder_hidden_dim: int, Hidden dimension size of the encoder layers.
        num_attention_heads: int, Number of attention heads in multi-head
            attention for each layer.
        dropout: float, Dropout probability applied to attention outputs and
            feed-forward outputs in each layer.
        layer_norm_eps: float, Small constant added to the denominator for
            numerical stability in layer normalization.
        encoder_activation_function: str, Activation function used in the
            feed-forward networks of each layer.
        activation_dropout: float, Dropout probability applied after the
            activation function in the feed-forward networks.
        encoder_ffn_dim: int, Hidden dimension size of the feed-forward
            networks in each layer.
        num_encoder_layers: int, Number of encoder layers in the stack.
        kernel_initializer: str or Initializer, optional, Initializer for
            the kernel weights of each layer. Defaults to
            `"glorot_uniform"`.
        bias_initializer: str or Initializer, optional, Initializer for
            the bias weights of each layer. Defaults to
            `"zeros"`.
        **kwargs: Additional keyword arguments passed to the parent class.
    """

    def __init__(
        self,
        normalize_before,
        encoder_hidden_dim,
        num_attention_heads,
        dropout,
        layer_norm_eps,
        encoder_activation_function,
        activation_dropout,
        encoder_ffn_dim,
        num_encoder_layers,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        dtype=None,
        **kwargs,
    ):
        super().__init__(dtype=dtype, **kwargs)
        self.normalize_before = normalize_before
        self.encoder_hidden_dim = encoder_hidden_dim
        self.num_attention_heads = num_attention_heads
        self.dropout_rate = dropout
        self.layer_norm_eps = layer_norm_eps
        self.encoder_activation_function = encoder_activation_function
        self.activation_dropout_rate = activation_dropout
        self.encoder_ffn_dim = encoder_ffn_dim
        self.num_encoder_layers = num_encoder_layers
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer
        self.encoder_layer = []
        for i in range(self.num_encoder_layers):
            layer = DFineEncoderLayer(
                normalize_before=self.normalize_before,
                encoder_hidden_dim=self.encoder_hidden_dim,
                num_attention_heads=self.num_attention_heads,
                dropout=self.dropout_rate,
                layer_norm_eps=self.layer_norm_eps,
                encoder_activation_function=self.encoder_activation_function,
                activation_dropout=self.activation_dropout_rate,
                encoder_ffn_dim=self.encoder_ffn_dim,
                kernel_initializer=self.kernel_initializer,
                bias_initializer=self.bias_initializer,
                dtype=self.dtype_policy,
                name=f"encoder_layer_{i}",
            )
            self.encoder_layer.append(layer)

    def build(self, input_shape):
        current_input_shape_for_layer = input_shape
        for encoder_layer_instance in self.encoder_layer:
            encoder_layer_instance.build(current_input_shape_for_layer)
        super().build(input_shape)

    def compute_output_spec(
        self, src, src_mask=None, pos_embed=None, output_attentions=False
    ):
        if not self.encoder_layer:
            if output_attentions:
                return src, None
            return src
        encoder_layer_output_spec = self.encoder_layer[0].compute_output_spec(
            hidden_states=src,
            attention_mask=src_mask,
            position_embeddings=pos_embed,
            output_attentions=output_attentions,
        )
        if output_attentions:
            return encoder_layer_output_spec
        return encoder_layer_output_spec

    def call(
        self,
        src,
        src_mask=None,
        pos_embed=None,
        output_attentions=False,
        training=None,
    ):
        current_hidden_tensor = src
        last_layer_attn_weights = None

        for encoder_layer_instance in self.encoder_layer:
            current_hidden_tensor, layer_attn_weights = encoder_layer_instance(
                hidden_states=current_hidden_tensor,
                attention_mask=src_mask,
                position_embeddings=pos_embed,
                output_attentions=output_attentions,
                training=training,
            )
            if output_attentions:
                last_layer_attn_weights = layer_attn_weights

        return current_hidden_tensor, last_layer_attn_weights

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "normalize_before": self.normalize_before,
                "encoder_hidden_dim": self.encoder_hidden_dim,
                "num_attention_heads": self.num_attention_heads,
                "dropout": self.dropout_rate,
                "layer_norm_eps": self.layer_norm_eps,
                "encoder_activation_function": self.encoder_activation_function,
                "activation_dropout": self.activation_dropout_rate,
                "encoder_ffn_dim": self.encoder_ffn_dim,
                "num_encoder_layers": self.num_encoder_layers,
                "kernel_initializer": self.kernel_initializer,
                "bias_initializer": self.bias_initializer,
            }
        )
        return config
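For orientation, here is a minimal usage sketch of the new `DFineEncoder`. The hyperparameter values are illustrative only (not taken from a released D-FINE preset), and the sketch assumes a nightly build in which the module above is importable as `keras_hub.src.models.d_fine.d_fine_encoder`:

    import numpy as np

    from keras_hub.src.models.d_fine.d_fine_encoder import DFineEncoder

    # Illustrative hyperparameters; a real configuration would come from a
    # D-FINE preset rather than these hand-picked values.
    encoder = DFineEncoder(
        normalize_before=False,
        encoder_hidden_dim=256,
        num_attention_heads=8,
        dropout=0.0,
        layer_norm_eps=1e-5,
        encoder_activation_function="gelu",
        activation_dropout=0.0,
        encoder_ffn_dim=1024,
        num_encoder_layers=1,
    )

    # Flattened feature map: batch of 2, 20 x 20 spatial positions, 256 channels.
    src = np.random.uniform(size=(2, 400, 256)).astype("float32")

    # The encoder returns the refined sequence and the last layer's attention
    # weights (the latter stays None unless `output_attentions=True`).
    hidden_states, attn_weights = encoder(src, output_attentions=True)
    print(hidden_states.shape)  # (2, 400, 256)

Inside the backbone it is `DFineHybridEncoder` that instantiates this stack and feeds it the flattened feature maps, so the call above only mirrors that internal usage.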