keras-hub-nightly 0.22.0.dev202505290412__py3-none-any.whl → 0.22.0.dev202505310408__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- keras_hub/layers/__init__.py +3 -0
- keras_hub/models/__init__.py +16 -0
- keras_hub/src/models/deit/__init__.py +0 -0
- keras_hub/src/models/deit/deit_backbone.py +154 -0
- keras_hub/src/models/deit/deit_image_classifier.py +171 -0
- keras_hub/src/models/deit/deit_image_classifier_preprocessor.py +12 -0
- keras_hub/src/models/deit/deit_image_converter.py +8 -0
- keras_hub/src/models/deit/deit_layers.py +519 -0
- keras_hub/src/models/deit/deit_presets.py +49 -0
- keras_hub/src/models/mixtral/mixtral_presets.py +4 -4
- keras_hub/src/models/qwen/qwen_presets.py +6 -6
- keras_hub/src/models/qwen3/qwen3_attention.py +369 -0
- keras_hub/src/models/qwen3/qwen3_backbone.py +191 -0
- keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py +10 -0
- keras_hub/src/models/qwen3/qwen3_decoder.py +309 -0
- keras_hub/src/models/qwen3/qwen3_layernorm.py +38 -0
- keras_hub/src/models/qwen3/qwen3_tokenizer.py +48 -0
- keras_hub/src/models/qwen_moe/qwen_moe_presets.py +2 -2
- keras_hub/src/utils/transformers/convert_deit.py +155 -0
- keras_hub/src/utils/transformers/convert_qwen3.py +145 -0
- keras_hub/src/utils/transformers/preset_loader.py +7 -1
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/RECORD +26 -11
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/top_level.txt +0 -0
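
The headline additions in this nightly are a new DeiT (Data-efficient Image Transformer) model family, with presets and a Hugging Face checkpoint converter, plus the scaffolding for Qwen3 (attention, decoder, backbone, tokenizer, and converter). As a rough, hedged sketch of how the DeiT additions would typically be consumed once the wheel is installed — the underscore-style preset name below is inferred from the kaggle handles in deit_presets.py, not stated by this diff:

    # Hedged usage sketch (not part of this diff). Assumes the preset is
    # registered under the final segment of its kaggle handle.
    import keras_hub
    import numpy as np

    classifier = keras_hub.models.ImageClassifier.from_preset(
        "deit_base_distilled_patch16_224_imagenet"  # assumed preset name
    )
    # The attached preprocessor handles resizing/normalization of raw images.
    scores = classifier.predict(np.random.uniform(size=(1, 224, 224, 3)))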

keras_hub/src/models/deit/deit_layers.py
@@ -0,0 +1,519 @@
+import keras
+from keras import ops
+
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+class DeiTEmbeddings(keras.layers.Layer):
+    """Patches the image and embeds the patches.
+
+    Args:
+        image_size: tuple. Size of the input image (height, width).
+        patch_size: tuple. The size of each image
+            patch as (patch_height, patch_width).
+        hidden_dim: int. Dimensionality of the patch embeddings.
+        num_channels: int. Number of channels in the input image. Defaults to
+            `3`.
+        data_format: str. `"channels_last"` or `"channels_first"`. Defaults to
+            `None` (which uses `"channels_last"`).
+        use_mask_token: bool. Whether to use a mask token. Defaults to `False`.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+            `0.0`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        image_size,
+        patch_size,
+        hidden_dim,
+        num_channels=3,
+        data_format=None,
+        use_mask_token=False,
+        dropout_rate=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        num_patches = (image_size[0] // patch_size[0]) * (
+            image_size[1] // patch_size[1]
+        )
+        num_positions = num_patches + 2
+
+        # === Config ===
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_dim = hidden_dim
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.num_positions = num_positions
+        self.data_format = standardize_data_format(data_format)
+        self.use_mask_token = use_mask_token
+        self.dropout_rate = dropout_rate
+
+    def build(self, input_shape):
+        if self.use_mask_token:
+            self.mask_token = self.add_weight(
+                shape=(1, 1, self.hidden_dim),
+                initializer="zeros",
+                dtype=self.variable_dtype,
+                name="mask_token",
+            )
+        self.class_token = self.add_weight(
+            shape=(
+                1,
+                1,
+                self.hidden_dim,
+            ),
+            initializer="zeros",
+            dtype=self.variable_dtype,
+            name="class_token",
+        )
+        self.distillation_token = self.add_weight(
+            shape=(
+                1,
+                1,
+                self.hidden_dim,
+            ),
+            initializer="zeros",
+            dtype=self.variable_dtype,
+            name="distillation_token",
+        )
+        self.patch_embedding = keras.layers.Conv2D(
+            filters=self.hidden_dim,
+            kernel_size=self.patch_size,
+            strides=self.patch_size,
+            padding="valid",
+            activation=None,
+            dtype=self.dtype_policy,
+            data_format=self.data_format,
+            name="patch_embedding",
+        )
+        self.patch_embedding.build(input_shape)
+        self.position_embedding = self.add_weight(
+            shape=(
+                1,
+                self.num_positions,
+                self.hidden_dim,
+            ),  # Matches the shape in PyTorch
+            initializer=keras.initializers.RandomNormal(
+                stddev=0.02
+            ),  # Equivalent to torch.randn()
+            dtype=self.variable_dtype,
+            trainable=True,
+            name="position_embedding",
+        )
+        self.dropout = keras.layers.Dropout(
+            self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+        )
+
+        self.built = True
+
+    def call(self, inputs, bool_masked_pos=None):
+        patch_embeddings = self.patch_embedding(inputs)
+        if self.data_format == "channels_first":
+            patch_embeddings = ops.transpose(
+                patch_embeddings, axes=(0, 2, 3, 1)
+            )
+        embeddings_shape = ops.shape(patch_embeddings)
+        patch_embeddings = ops.reshape(
+            patch_embeddings, [embeddings_shape[0], -1, embeddings_shape[-1]]
+        )
+
+        if bool_masked_pos is not None and self.use_mask_token:
+            # Expand dimensions to match the embeddings
+            bool_masked_pos_expanded = ops.expand_dims(
+                bool_masked_pos, axis=-1
+            )  # (batch_size, num_patches, 1)
+            mask_token_expanded = ops.expand_dims(
+                self.mask_token, axis=0
+            )  # (1, 1, hidden_size)
+            # Apply masking
+            embeddings = ops.where(
+                bool_masked_pos_expanded, mask_token_expanded, patch_embeddings
+            )
+
+        class_token = ops.tile(self.class_token, (embeddings_shape[0], 1, 1))
+        distillation_token = ops.tile(
+            self.distillation_token, (embeddings_shape[0], 1, 1)
+        )
+        embeddings = ops.concatenate(
+            [class_token, distillation_token, patch_embeddings], axis=1
+        )
+        position_embedding = self.position_embedding
+        embeddings = ops.add(embeddings, position_embedding)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def compute_output_shape(self, input_shape):
+        return (
+            input_shape[0],
+            self.num_positions,
+            self.hidden_dim,
+        )
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "image_size": self.image_size,
+                "patch_size": self.patch_size,
+                "hidden_dim": self.hidden_dim,
+                "num_channels": self.num_channels,
+                "num_patches": self.num_patches,
+                "num_positions": self.num_positions,
+                "use_mask_token": self.use_mask_token,
+                "dropout_rate": self.dropout_rate,
+            }
+        )
+        return config
+
+
+class DeiTIntermediate(keras.layers.Layer):
+    """DeiTIntermediate block.
+    Args:
+        intermediate_dim: int. Dimensionality of the intermediate MLP layer.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        intermediate_dim,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # === Config ===
+        self.intermediate_dim = intermediate_dim
+
+    def build(self, input_shape):
+        self.dense = keras.layers.Dense(
+            units=self.intermediate_dim,
+            activation="gelu",
+            dtype=self.dtype_policy,
+            name="dense",
+        )
+        self.dense.build(input_shape)
+        self.built = True
+
+    def call(self, inputs):
+        out = self.dense(inputs)
+        return out
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "intermediate_dim": self.intermediate_dim,
+            }
+        )
+        return config
+
+
+class DeiTOutput(keras.layers.Layer):
+    """DeiT Output layer implementation.
+    Args:
+        hidden_dim: int. Dimensionality of the patch embeddings.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+            `0.0`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(self, hidden_dim, dropout_rate=0.1, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_dim = hidden_dim
+        self.dropout_rate = dropout_rate
+
+    def build(self, input_shape):
+        self.dense = keras.layers.Dense(
+            self.hidden_dim, dtype=self.dtype_policy, name="output"
+        )
+        self.dense.build(input_shape)
+
+        self.dropout = keras.layers.Dropout(
+            self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+        )
+        # Mark this layer as built
+        self.built = True
+
+    def call(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)  # Linear transformation
+        hidden_states = self.dropout(hidden_states)  # Apply dropout
+        hidden_states = hidden_states + input_tensor  # Residual connection
+        return hidden_states
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "dropout_rate": self.dropout_rate,
+            }
+        )
+        return config
+
+
+class DeiTEncoderBlock(keras.layers.Layer):
+    """DeiT encoder block.
+    Args:
+        num_heads: int. Number of attention heads.
+        hidden_dim: int. Dimensionality of the hidden representations.
+        intermediate_dim: int. Dimensionality of the intermediate MLP layer.
+        use_mha_bias: bool. Whether to use bias in the multi-head attention
+            layer. Defaults to `True`.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+            `0.0`.
+        attention_dropout: float. Dropout rate for the attention mechanism.
+            Between 0 and 1. Defaults to `0.0`.
+        layer_norm_epsilon: float. Small float value for layer normalization
+            stability. Defaults to `1e-6`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        num_heads,
+        hidden_dim,
+        intermediate_dim,
+        use_mha_bias=True,
+        dropout_rate=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        key_dim = hidden_dim // num_heads
+
+        # === Config ===
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.intermediate_dim = intermediate_dim
+        self.key_dim = key_dim
+        self.use_mha_bias = use_mha_bias
+        self.dropout_rate = dropout_rate
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+
+    def build(self, input_shape):
+        # Attention block
+        self.layer_norm_1 = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+            name="ln_1",
+            dtype=self.dtype_policy,
+        )
+        self.layer_norm_1.build(input_shape)
+        self.mha = keras.layers.MultiHeadAttention(
+            num_heads=self.num_heads,
+            key_dim=self.key_dim,
+            use_bias=self.use_mha_bias,
+            dropout=self.attention_dropout,
+            name="mha",
+            dtype=self.dtype_policy,
+        )
+        self.mha.build(input_shape, input_shape)
+
+        # MLP block
+        self.layer_norm_2 = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+            name="ln_2",
+            dtype=self.dtype_policy,
+        )
+        self.layer_norm_2.build((None, None, self.hidden_dim))
+
+        # Intermediate Layer
+        self.mlp = DeiTIntermediate(
+            self.intermediate_dim, dtype=self.dtype_policy, name="mlp"
+        )
+        self.mlp.build((None, None, self.hidden_dim))
+
+        # Output Layer
+        self.output_layer = DeiTOutput(
+            self.hidden_dim,
+            self.dropout_rate,
+            dtype=self.dtype_policy,
+            name="output_layer",
+        )
+
+        self.output_layer.build((None, None, self.intermediate_dim))
+
+        self.built = True
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        return_attention_scores=False,
+    ):
+        attention_scores = None
+        x = self.layer_norm_1(hidden_states)
+        if return_attention_scores:
+            x, attention_scores = self.mha(
+                x,
+                x,
+                attention_mask=attention_mask,
+                return_attention_scores=return_attention_scores,
+            )
+        else:
+            x = self.mha(
+                x,
+                x,
+                attention_mask=attention_mask,
+            )
+
+        x = x + hidden_states
+        y = self.layer_norm_2(x)
+        y = self.mlp(y)
+        y = self.output_layer(y, x)
+
+        return y, attention_scores
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_heads": self.num_heads,
+                "hidden_dim": self.hidden_dim,
+                "intermediate_dim": self.intermediate_dim,
+                "key_dim": self.key_dim,
+                "use_mha_bias": self.use_mha_bias,
+                "dropout_rate": self.dropout_rate,
+                "attention_dropout": self.attention_dropout,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+            }
+        )
+        return config
+
+
+class DeiTEncoder(keras.layers.Layer):
+    """DeiT Encoder class.
+    Args:
+        num_layers: int. Number of Transformer encoder blocks.
+        num_heads: int. Number of attention heads.
+        hidden_dim: int. Dimensionality of the hidden representations.
+        intermediate_dim: int. Dimensionality of the intermediate MLP layer.
+        use_mha_bias: bool. Whether to use bias in the multi-head attention
+            layer. Defaults to `True`.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+            `0.0`.
+        attention_dropout: float. Dropout rate for the attention mechanism.
+            Between 0 and 1. Defaults to `0.0`.
+        layer_norm_epsilon: float. Small float value for layer normalization
+            stability. Defaults to `1e-6`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        num_layers,
+        num_heads,
+        hidden_dim,
+        intermediate_dim,
+        use_mha_bias=True,
+        dropout_rate=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # === Config ===
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.intermediate_dim = intermediate_dim
+        self.use_mha_bias = use_mha_bias
+        self.dropout_rate = dropout_rate
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+
+    def build(self, input_shape):
+        self.encoder_layers = []
+        for i in range(self.num_layers):
+            encoder_block = DeiTEncoderBlock(
+                num_heads=self.num_heads,
+                hidden_dim=self.hidden_dim,
+                intermediate_dim=self.intermediate_dim,
+                use_mha_bias=self.use_mha_bias,
+                dropout_rate=self.dropout_rate,
+                attention_dropout=self.attention_dropout,
+                layer_norm_epsilon=self.layer_norm_epsilon,
+                dtype=self.dtype_policy,
+                name=f"transformer_block_{i + 1}",
+            )
+            encoder_block.build((None, None, self.hidden_dim))
+            self.encoder_layers.append(encoder_block)
+
+        self.layer_norm = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+            dtype=self.dtype_policy,
+            name="ln",
+        )
+        self.layer_norm.build((None, None, self.hidden_dim))
+
+        self.built = True
+
+    def call(
+        self,
+        hidden_states,
+        attention_masks=None,
+        output_hidden_states=False,
+        return_attention_scores=False,
+    ):
+        seq_len = ops.shape(hidden_states)[1]  # Sequence length
+        hidden_dim = ops.shape(hidden_states)[2]  # Hidden size
+
+        # Ensure valid tensor output even if disabled
+        all_hidden_states = (
+            ops.empty(shape=(0, seq_len, hidden_dim), dtype=hidden_states.dtype)
+            if not output_hidden_states
+            else ()
+        )
+
+        all_self_attentions_scores = (
+            ops.empty(
+                shape=(0, self.num_heads, seq_len, seq_len),
+                dtype=hidden_states.dtype,
+            )
+            if not return_attention_scores
+            else ()
+        )
+
+        for i in range(self.num_layers):
+            attention_mask = (
+                attention_masks[i] if attention_masks is not None else None
+            )
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            hidden_states, scores = self.encoder_layers[i](
+                hidden_states,
+                attention_mask=attention_mask,
+                return_attention_scores=return_attention_scores,
+            )
+            if return_attention_scores:
+                all_self_attentions_scores = all_self_attentions_scores + (
+                    scores,
+                )
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        return hidden_states, all_hidden_states, all_self_attentions_scores
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_layers": self.num_layers,
+                "num_heads": self.num_heads,
+                "hidden_dim": self.hidden_dim,
+                "intermediate_dim": self.intermediate_dim,
+                "key_dim": self.key_dim,
+                "use_mha_bias": self.use_mha_bias,
+                "dropout_rate": self.dropout_rate,
+                "attention_dropout": self.attention_dropout,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+            }
+        )
+        return config
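
For reference, the layers in this new module compose in the usual patch-embed-then-encode order: `DeiTEmbeddings` turns an image into a sequence of patch tokens plus class and distillation tokens, and `DeiTEncoder` runs that sequence through the transformer blocks and a final layer norm. A minimal sketch of exercising them directly, with illustrative (roughly DeiT-Tiny-sized) hyperparameters that are not prescribed by this diff:

    # Minimal sketch, assuming the nightly wheel is installed; sizes are
    # illustrative only, not taken from this diff.
    import numpy as np
    from keras_hub.src.models.deit.deit_layers import DeiTEmbeddings
    from keras_hub.src.models.deit.deit_layers import DeiTEncoder

    embeddings = DeiTEmbeddings(
        image_size=(224, 224), patch_size=(16, 16), hidden_dim=192
    )
    encoder = DeiTEncoder(
        num_layers=12, num_heads=3, hidden_dim=192, intermediate_dim=768
    )

    images = np.random.uniform(size=(2, 224, 224, 3)).astype("float32")
    tokens = embeddings(images)  # (2, 198, 192): 196 patches + 2 tokens
    outputs, _, _ = encoder(tokens)  # (2, 198, 192) after the final layer norm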

keras_hub/src/models/deit/deit_presets.py
@@ -0,0 +1,49 @@
+"""DeiT model preset configurations."""
+
+# Metadata for loading pretrained model weights.
+backbone_presets = {
+    "deit-base-distilled-patch16-384_imagenet": {
+        "metadata": {
+            "description": (
+                "DeiT-B16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 384x384 "
+            ),
+            "params": 86092032,
+            "path": "deit",
+        },
+        "kaggle_handle": "kaggle://keras/deit/keras/deit_base_distilled_patch16_384_imagenet/1",
+    },
+    "deit-base-distilled-patch16-224_imagenet": {
+        "metadata": {
+            "description": (
+                "DeiT-B16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 224x224 "
+            ),
+            "params": 85800192,
+            "path": "deit",
+        },
+        "kaggle_handle": "kaggle://keras/deit/keras/deit_base_distilled_patch16_224_imagenet/1",
+    },
+    "deit-tiny-distilled-patch16-224_imagenet": {
+        "metadata": {
+            "description": (
+                "DeiT-T16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 224x224 "
+            ),
+            "params": 5524800,
+            "path": "deit",
+        },
+        "kaggle_handle": "kaggle://keras/deit/keras/deit_tiny_distilled_patch16_224_imagenet/1",
+    },
+    "deit-small-distilled-patch16-224_imagenet": {
+        "metadata": {
+            "description": (
+                "DeiT-S16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 224x224 "
+            ),
+            "params": 21666432,
+            "path": "deit",
+        },
+        "kaggle_handle": "kaggle://keras/deit/keras/deit_small_distilled_patch16_224_imagenet/1",
+    },
+}
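
Each preset entry above pairs descriptive metadata (parameter count, path) with a `kaggle_handle` pointing at the hosted weights. A hedged sketch of loading one of these presets through the generic `Backbone.from_preset` entry point, here passing the full kaggle handle taken verbatim from this file (any registered short names are not shown in this diff):

    # Hedged sketch, not part of this diff; uses the kaggle handle verbatim.
    import keras_hub

    backbone = keras_hub.models.Backbone.from_preset(
        "kaggle://keras/deit/keras/deit_tiny_distilled_patch16_224_imagenet/1"
    )
    print(backbone.count_params())  # ~5.52M per the "params" metadata above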

keras_hub/src/models/mixtral/mixtral_presets.py
@@ -4,8 +4,8 @@ backbone_presets = {
     "mixtral_8_7b_en": {
         "metadata": {
             "description": (
-                "32-layer Mixtral MoE model with 7 billion"
-                "active parameters and 8 experts per MoE layer."
+                "32-layer Mixtral MoE model with 7 billion"
+                "active parameters and 8 experts per MoE layer."
             ),
             "params": 46702792704,
             "path": "mixtral",
@@ -15,8 +15,8 @@ backbone_presets = {
     "mixtral_8_instruct_7b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 32-layer Mixtral MoE model"
-                "with 7 billion active parameters and 8 experts per MoE layer."
+                "Instruction fine-tuned 32-layer Mixtral MoE model"
+                "with 7 billion active parameters and 8 experts per MoE layer."
             ),
             "params": 46702792704,
             "path": "mixtral",

keras_hub/src/models/qwen/qwen_presets.py
@@ -28,8 +28,8 @@ backbone_presets = {
     "qwen2.5_instruct_0.5b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 24-layer Qwen model with 0.5 "
-                "billion parameters."
+                "Instruction fine-tuned 24-layer Qwen model with 0.5 "
+                "billion parameters."
             ),
             "params": 494032768,
             "path": "qwen",
@@ -39,8 +39,8 @@ backbone_presets = {
     "qwen2.5_instruct_32b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 64-layer Qwen model with 32 "
-                "billion parameters."
+                "Instruction fine-tuned 64-layer Qwen model with 32 "
+                "billion parameters."
             ),
             "params": 32763876352,
             "path": "qwen",
@@ -50,8 +50,8 @@ backbone_presets = {
     "qwen2.5_instruct_72b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 80-layer Qwen model with 72 "
-                "billion parameters."
+                "Instruction fine-tuned 80-layer Qwen model with 72 "
+                "billion parameters."
             ),
             "params": 72706203648,
             "path": "qwen",