keras-hub-nightly 0.22.0.dev202505290412__py3-none-any.whl → 0.22.0.dev202505310408__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. keras_hub/layers/__init__.py +3 -0
  2. keras_hub/models/__init__.py +16 -0
  3. keras_hub/src/models/deit/__init__.py +0 -0
  4. keras_hub/src/models/deit/deit_backbone.py +154 -0
  5. keras_hub/src/models/deit/deit_image_classifier.py +171 -0
  6. keras_hub/src/models/deit/deit_image_classifier_preprocessor.py +12 -0
  7. keras_hub/src/models/deit/deit_image_converter.py +8 -0
  8. keras_hub/src/models/deit/deit_layers.py +519 -0
  9. keras_hub/src/models/deit/deit_presets.py +49 -0
  10. keras_hub/src/models/mixtral/mixtral_presets.py +4 -4
  11. keras_hub/src/models/qwen/qwen_presets.py +6 -6
  12. keras_hub/src/models/qwen3/qwen3_attention.py +369 -0
  13. keras_hub/src/models/qwen3/qwen3_backbone.py +191 -0
  14. keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py +10 -0
  15. keras_hub/src/models/qwen3/qwen3_decoder.py +309 -0
  16. keras_hub/src/models/qwen3/qwen3_layernorm.py +38 -0
  17. keras_hub/src/models/qwen3/qwen3_tokenizer.py +48 -0
  18. keras_hub/src/models/qwen_moe/qwen_moe_presets.py +2 -2
  19. keras_hub/src/utils/transformers/convert_deit.py +155 -0
  20. keras_hub/src/utils/transformers/convert_qwen3.py +145 -0
  21. keras_hub/src/utils/transformers/preset_loader.py +7 -1
  22. keras_hub/src/version.py +1 -1
  23. {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/METADATA +1 -1
  24. {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/RECORD +26 -11
  25. {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/WHEEL +0 -0
  26. {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505310408.dist-info}/top_level.txt +0 -0
keras_hub/src/models/deit/deit_layers.py
@@ -0,0 +1,519 @@
+ import keras
+ from keras import ops
+
+ from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+ class DeiTEmbeddings(keras.layers.Layer):
+     """Patches the image and embeds the patches.
+
+     Args:
+         image_size: tuple. Size of the input image (height, width).
+         patch_size: tuple. The size of each image
+             patch as (patch_height, patch_width).
+         hidden_dim: int. Dimensionality of the patch embeddings.
+         num_channels: int. Number of channels in the input image. Defaults to
+             `3`.
+         data_format: str. `"channels_last"` or `"channels_first"`. Defaults to
+             `None` (which uses `"channels_last"`).
+         use_mask_token: bool. Whether to use a mask token. Defaults to `False`.
+         dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+             `0.0`.
+         **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+     """
+
+     def __init__(
+         self,
+         image_size,
+         patch_size,
+         hidden_dim,
+         num_channels=3,
+         data_format=None,
+         use_mask_token=False,
+         dropout_rate=0.0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         num_patches = (image_size[0] // patch_size[0]) * (
+             image_size[1] // patch_size[1]
+         )
+         num_positions = num_patches + 2
+
+         # === Config ===
+         self.image_size = image_size
+         self.patch_size = patch_size
+         self.hidden_dim = hidden_dim
+         self.num_channels = num_channels
+         self.num_patches = num_patches
+         self.num_positions = num_positions
+         self.data_format = standardize_data_format(data_format)
+         self.use_mask_token = use_mask_token
+         self.dropout_rate = dropout_rate
+
+     def build(self, input_shape):
+         if self.use_mask_token:
+             self.mask_token = self.add_weight(
+                 shape=(1, 1, self.hidden_dim),
+                 initializer="zeros",
+                 dtype=self.variable_dtype,
+                 name="mask_token",
+             )
+         self.class_token = self.add_weight(
+             shape=(
+                 1,
+                 1,
+                 self.hidden_dim,
+             ),
+             initializer="zeros",
+             dtype=self.variable_dtype,
+             name="class_token",
+         )
+         self.distillation_token = self.add_weight(
+             shape=(
+                 1,
+                 1,
+                 self.hidden_dim,
+             ),
+             initializer="zeros",
+             dtype=self.variable_dtype,
+             name="distillation_token",
+         )
+         self.patch_embedding = keras.layers.Conv2D(
+             filters=self.hidden_dim,
+             kernel_size=self.patch_size,
+             strides=self.patch_size,
+             padding="valid",
+             activation=None,
+             dtype=self.dtype_policy,
+             data_format=self.data_format,
+             name="patch_embedding",
+         )
+         self.patch_embedding.build(input_shape)
+         self.position_embedding = self.add_weight(
+             shape=(
+                 1,
+                 self.num_positions,
+                 self.hidden_dim,
+             ),  # Matches the shape in PyTorch
+             initializer=keras.initializers.RandomNormal(
+                 stddev=0.02
+             ),  # Equivalent to torch.randn()
+             dtype=self.variable_dtype,
+             trainable=True,
+             name="position_embedding",
+         )
+         self.dropout = keras.layers.Dropout(
+             self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+         )
+
+         self.built = True
+
+     def call(self, inputs, bool_masked_pos=None):
+         patch_embeddings = self.patch_embedding(inputs)
+         if self.data_format == "channels_first":
+             patch_embeddings = ops.transpose(
+                 patch_embeddings, axes=(0, 2, 3, 1)
+             )
+         embeddings_shape = ops.shape(patch_embeddings)
+         patch_embeddings = ops.reshape(
+             patch_embeddings, [embeddings_shape[0], -1, embeddings_shape[-1]]
+         )
+
+         if bool_masked_pos is not None and self.use_mask_token:
+             # Expand dimensions to match the embeddings
+             bool_masked_pos_expanded = ops.expand_dims(
+                 bool_masked_pos, axis=-1
+             )  # (batch_size, num_patches, 1)
+             mask_token_expanded = ops.expand_dims(
+                 self.mask_token, axis=0
+             )  # (1, 1, hidden_size)
+             # Apply masking
+             embeddings = ops.where(
+                 bool_masked_pos_expanded, mask_token_expanded, patch_embeddings
+             )
+
+         class_token = ops.tile(self.class_token, (embeddings_shape[0], 1, 1))
+         distillation_token = ops.tile(
+             self.distillation_token, (embeddings_shape[0], 1, 1)
+         )
+         embeddings = ops.concatenate(
+             [class_token, distillation_token, patch_embeddings], axis=1
+         )
+         position_embedding = self.position_embedding
+         embeddings = ops.add(embeddings, position_embedding)
+         embeddings = self.dropout(embeddings)
+         return embeddings
+
+     def compute_output_shape(self, input_shape):
+         return (
+             input_shape[0],
+             self.num_positions,
+             self.hidden_dim,
+         )
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "image_size": self.image_size,
+                 "patch_size": self.patch_size,
+                 "hidden_dim": self.hidden_dim,
+                 "num_channels": self.num_channels,
+                 "num_patches": self.num_patches,
+                 "num_positions": self.num_positions,
+                 "use_mask_token": self.use_mask_token,
+                 "dropout_rate": self.dropout_rate,
+             }
+         )
+         return config
+
+
+ class DeiTIntermediate(keras.layers.Layer):
+     """DeiTIntermediate block.
+     Args:
+         intermediate_dim: int. Dimensionality of the intermediate MLP layer.
+         **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+     """
+
+     def __init__(
+         self,
+         intermediate_dim,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         # === Config ===
+         self.intermediate_dim = intermediate_dim
+
+     def build(self, input_shape):
+         self.dense = keras.layers.Dense(
+             units=self.intermediate_dim,
+             activation="gelu",
+             dtype=self.dtype_policy,
+             name="dense",
+         )
+         self.dense.build(input_shape)
+         self.built = True
+
+     def call(self, inputs):
+         out = self.dense(inputs)
+         return out
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "intermediate_dim": self.intermediate_dim,
+             }
+         )
+         return config
+
+
+ class DeiTOutput(keras.layers.Layer):
+     """DeiT Output layer implementation.
+     Args:
+         hidden_dim: int. Dimensionality of the patch embeddings.
+         dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+             `0.0`.
+         **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+     """
+
+     def __init__(self, hidden_dim, dropout_rate=0.1, **kwargs):
+         super().__init__(**kwargs)
+         self.hidden_dim = hidden_dim
+         self.dropout_rate = dropout_rate
+
+     def build(self, input_shape):
+         self.dense = keras.layers.Dense(
+             self.hidden_dim, dtype=self.dtype_policy, name="output"
+         )
+         self.dense.build(input_shape)
+
+         self.dropout = keras.layers.Dropout(
+             self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+         )
+         # Mark this layer as built
+         self.built = True
+
+     def call(self, hidden_states, input_tensor):
+         hidden_states = self.dense(hidden_states)  # Linear transformation
+         hidden_states = self.dropout(hidden_states)  # Apply dropout
+         hidden_states = hidden_states + input_tensor  # Residual connection
+         return hidden_states
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "hidden_dim": self.hidden_dim,
+                 "dropout_rate": self.dropout_rate,
+             }
+         )
+         return config
+
+
+ class DeiTEncoderBlock(keras.layers.Layer):
+     """DeiT encoder block.
+     Args:
+         num_heads: int. Number of attention heads.
+         hidden_dim: int. Dimensionality of the hidden representations.
+         intermediate_dim: int. Dimensionality of the intermediate MLP layer.
+         use_mha_bias: bool. Whether to use bias in the multi-head attention
+             layer. Defaults to `True`.
+         dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+             `0.0`.
+         attention_dropout: float. Dropout rate for the attention mechanism.
+             Between 0 and 1. Defaults to `0.0`.
+         layer_norm_epsilon: float. Small float value for layer normalization
+             stability. Defaults to `1e-6`.
+         **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+     """
+
+     def __init__(
+         self,
+         num_heads,
+         hidden_dim,
+         intermediate_dim,
+         use_mha_bias=True,
+         dropout_rate=0.0,
+         attention_dropout=0.0,
+         layer_norm_epsilon=1e-6,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         key_dim = hidden_dim // num_heads
+
+         # === Config ===
+         self.num_heads = num_heads
+         self.hidden_dim = hidden_dim
+         self.intermediate_dim = intermediate_dim
+         self.key_dim = key_dim
+         self.use_mha_bias = use_mha_bias
+         self.dropout_rate = dropout_rate
+         self.attention_dropout = attention_dropout
+         self.layer_norm_epsilon = layer_norm_epsilon
+
+     def build(self, input_shape):
+         # Attention block
+         self.layer_norm_1 = keras.layers.LayerNormalization(
+             epsilon=self.layer_norm_epsilon,
+             name="ln_1",
+             dtype=self.dtype_policy,
+         )
+         self.layer_norm_1.build(input_shape)
+         self.mha = keras.layers.MultiHeadAttention(
+             num_heads=self.num_heads,
+             key_dim=self.key_dim,
+             use_bias=self.use_mha_bias,
+             dropout=self.attention_dropout,
+             name="mha",
+             dtype=self.dtype_policy,
+         )
+         self.mha.build(input_shape, input_shape)
+
+         # MLP block
+         self.layer_norm_2 = keras.layers.LayerNormalization(
+             epsilon=self.layer_norm_epsilon,
+             name="ln_2",
+             dtype=self.dtype_policy,
+         )
+         self.layer_norm_2.build((None, None, self.hidden_dim))
+
+         # Intermediate Layer
+         self.mlp = DeiTIntermediate(
+             self.intermediate_dim, dtype=self.dtype_policy, name="mlp"
+         )
+         self.mlp.build((None, None, self.hidden_dim))
+
+         # Output Layer
+         self.output_layer = DeiTOutput(
+             self.hidden_dim,
+             self.dropout_rate,
+             dtype=self.dtype_policy,
+             name="output_layer",
+         )
+
+         self.output_layer.build((None, None, self.intermediate_dim))
+
+         self.built = True
+
+     def call(
+         self,
+         hidden_states,
+         attention_mask=None,
+         return_attention_scores=False,
+     ):
+         attention_scores = None
+         x = self.layer_norm_1(hidden_states)
+         if return_attention_scores:
+             x, attention_scores = self.mha(
+                 x,
+                 x,
+                 attention_mask=attention_mask,
+                 return_attention_scores=return_attention_scores,
+             )
+         else:
+             x = self.mha(
+                 x,
+                 x,
+                 attention_mask=attention_mask,
+             )
+
+         x = x + hidden_states
+         y = self.layer_norm_2(x)
+         y = self.mlp(y)
+         y = self.output_layer(y, x)
+
+         return y, attention_scores
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "num_heads": self.num_heads,
+                 "hidden_dim": self.hidden_dim,
+                 "intermediate_dim": self.intermediate_dim,
+                 "key_dim": self.key_dim,
+                 "use_mha_bias": self.use_mha_bias,
+                 "dropout_rate": self.dropout_rate,
+                 "attention_dropout": self.attention_dropout,
+                 "layer_norm_epsilon": self.layer_norm_epsilon,
+             }
+         )
+         return config
+
+
+ class DeiTEncoder(keras.layers.Layer):
+     """DeiT Encoder class.
+     Args:
+         num_layers: int. Number of Transformer encoder blocks.
+         num_heads: int. Number of attention heads.
+         hidden_dim: int. Dimensionality of the hidden representations.
+         intermediate_dim: int. Dimensionality of the intermediate MLP layer.
+         use_mha_bias: bool. Whether to use bias in the multi-head attention
+             layer. Defaults to `True`.
+         dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to
+             `0.0`.
+         attention_dropout: float. Dropout rate for the attention mechanism.
+             Between 0 and 1. Defaults to `0.0`.
+         layer_norm_epsilon: float. Small float value for layer normalization
+             stability. Defaults to `1e-6`.
+         **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+     """
+
+     def __init__(
+         self,
+         num_layers,
+         num_heads,
+         hidden_dim,
+         intermediate_dim,
+         use_mha_bias=True,
+         dropout_rate=0.0,
+         attention_dropout=0.0,
+         layer_norm_epsilon=1e-6,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         # === Config ===
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.hidden_dim = hidden_dim
+         self.intermediate_dim = intermediate_dim
+         self.use_mha_bias = use_mha_bias
+         self.dropout_rate = dropout_rate
+         self.attention_dropout = attention_dropout
+         self.layer_norm_epsilon = layer_norm_epsilon
+
+     def build(self, input_shape):
+         self.encoder_layers = []
+         for i in range(self.num_layers):
+             encoder_block = DeiTEncoderBlock(
+                 num_heads=self.num_heads,
+                 hidden_dim=self.hidden_dim,
+                 intermediate_dim=self.intermediate_dim,
+                 use_mha_bias=self.use_mha_bias,
+                 dropout_rate=self.dropout_rate,
+                 attention_dropout=self.attention_dropout,
+                 layer_norm_epsilon=self.layer_norm_epsilon,
+                 dtype=self.dtype_policy,
+                 name=f"transformer_block_{i + 1}",
+             )
+             encoder_block.build((None, None, self.hidden_dim))
+             self.encoder_layers.append(encoder_block)
+
+         self.layer_norm = keras.layers.LayerNormalization(
+             epsilon=self.layer_norm_epsilon,
+             dtype=self.dtype_policy,
+             name="ln",
+         )
+         self.layer_norm.build((None, None, self.hidden_dim))
+
+         self.built = True
+
+     def call(
+         self,
+         hidden_states,
+         attention_masks=None,
+         output_hidden_states=False,
+         return_attention_scores=False,
+     ):
+         seq_len = ops.shape(hidden_states)[1]  # Sequence length
+         hidden_dim = ops.shape(hidden_states)[2]  # Hidden size
+
+         # Ensure valid tensor output even if disabled
+         all_hidden_states = (
+             ops.empty(shape=(0, seq_len, hidden_dim), dtype=hidden_states.dtype)
+             if not output_hidden_states
+             else ()
+         )
+
+         all_self_attentions_scores = (
+             ops.empty(
+                 shape=(0, self.num_heads, seq_len, seq_len),
+                 dtype=hidden_states.dtype,
+             )
+             if not return_attention_scores
+             else ()
+         )
+
+         for i in range(self.num_layers):
+             attention_mask = (
+                 attention_masks[i] if attention_masks is not None else None
+             )
+             if output_hidden_states:
+                 all_hidden_states = all_hidden_states + (hidden_states,)
+
+             hidden_states, scores = self.encoder_layers[i](
+                 hidden_states,
+                 attention_mask=attention_mask,
+                 return_attention_scores=return_attention_scores,
+             )
+             if return_attention_scores:
+                 all_self_attentions_scores = all_self_attentions_scores + (
+                     scores,
+                 )
+
+         if output_hidden_states:
+             all_hidden_states = all_hidden_states + (hidden_states,)
+
+         hidden_states = self.layer_norm(hidden_states)
+
+         return hidden_states, all_hidden_states, all_self_attentions_scores
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "num_layers": self.num_layers,
+                 "num_heads": self.num_heads,
+                 "hidden_dim": self.hidden_dim,
+                 "intermediate_dim": self.intermediate_dim,
+                 "key_dim": self.key_dim,
+                 "use_mha_bias": self.use_mha_bias,
+                 "dropout_rate": self.dropout_rate,
+                 "attention_dropout": self.attention_dropout,
+                 "layer_norm_epsilon": self.layer_norm_epsilon,
+             }
+         )
+         return config
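Taken together, these layers implement the usual DeiT forward pass: patchify and embed the image, prepend the class and distillation tokens, add position embeddings, then run the token sequence through a stack of pre-norm transformer blocks with a final layer norm. The following is a minimal sketch of how the two top-level layers compose, assuming a channels-last image backend; the DeiT-Tiny-sized hyperparameters (192-dim, 3 heads, 12 layers) and the direct import from the new module are illustrative assumptions, not an official usage pattern from the package.

import numpy as np

from keras_hub.src.models.deit.deit_layers import DeiTEmbeddings, DeiTEncoder

# Hypothetical DeiT-Tiny-like configuration, for illustration only.
embeddings = DeiTEmbeddings(
    image_size=(224, 224), patch_size=(16, 16), hidden_dim=192
)
encoder = DeiTEncoder(
    num_layers=12, num_heads=3, hidden_dim=192, intermediate_dim=768
)

# Assumes the backend image data format is "channels_last".
images = np.random.uniform(size=(2, 224, 224, 3)).astype("float32")
tokens = embeddings(images)  # (2, 198, 192): 196 patches + class + distillation
outputs, _, _ = encoder(tokens)  # final hidden states, shape (2, 198, 192)
print(outputs.shape)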
keras_hub/src/models/deit/deit_presets.py
@@ -0,0 +1,49 @@
+ """DeiT model preset configurations."""
+
+ # Metadata for loading pretrained model weights.
+ backbone_presets = {
+     "deit-base-distilled-patch16-384_imagenet": {
+         "metadata": {
+             "description": (
+                 "DeiT-B16 model pre-trained on the ImageNet 1k dataset with "
+                 "image resolution of 384x384 "
+             ),
+             "params": 86092032,
+             "path": "deit",
+         },
+         "kaggle_handle": "kaggle://keras/deit/keras/deit_base_distilled_patch16_384_imagenet/1",
+     },
+     "deit-base-distilled-patch16-224_imagenet": {
+         "metadata": {
+             "description": (
+                 "DeiT-B16 model pre-trained on the ImageNet 1k dataset with "
+                 "image resolution of 224x224 "
+             ),
+             "params": 85800192,
+             "path": "deit",
+         },
+         "kaggle_handle": "kaggle://keras/deit/keras/deit_base_distilled_patch16_224_imagenet/1",
+     },
+     "deit-tiny-distilled-patch16-224_imagenet": {
+         "metadata": {
+             "description": (
+                 "DeiT-T16 model pre-trained on the ImageNet 1k dataset with "
+                 "image resolution of 224x224 "
+             ),
+             "params": 5524800,
+             "path": "deit",
+         },
+         "kaggle_handle": "kaggle://keras/deit/keras/deit_tiny_distilled_patch16_224_imagenet/1",
+     },
+     "deit-small-distilled-patch16-224_imagenet": {
+         "metadata": {
+             "description": (
+                 "DeiT-S16 model pre-trained on the ImageNet 1k dataset with "
+                 "image resolution of 224x224 "
+             ),
+             "params": 21666432,
+             "path": "deit",
+         },
+         "kaggle_handle": "kaggle://keras/deit/keras/deit_small_distilled_patch16_224_imagenet/1",
+     },
+ }
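In practice, presets registered this way are consumed through the high-level task API rather than by instantiating the layers directly. A minimal sketch, assuming the nightly wheel above is installed: the generic `keras_hub.models.ImageClassifier.from_preset` entry point is existing KerasHub API, while the resolved task class and the 1000-class output shape are assumptions based on the ImageNet-1k descriptions in the table.

import numpy as np
import keras_hub

# Preset name taken from the dict above; weights are fetched from the
# listed Kaggle handle on first use.
classifier = keras_hub.models.ImageClassifier.from_preset(
    "deit-base-distilled-patch16-224_imagenet"
)
images = np.random.uniform(0, 255, size=(1, 224, 224, 3)).astype("float32")
logits = classifier.predict(images)
print(logits.shape)  # expected (1, 1000) for an ImageNet-1k head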
keras_hub/src/models/mixtral/mixtral_presets.py
@@ -4,8 +4,8 @@ backbone_presets = {
      "mixtral_8_7b_en": {
          "metadata": {
              "description": (
-                 "32-layer Mixtral MoE model with 7 billion",
-                 "active parameters and 8 experts per MoE layer.",
+                 "32-layer Mixtral MoE model with 7 billion"
+                 "active parameters and 8 experts per MoE layer."
              ),
              "params": 46702792704,
              "path": "mixtral",
@@ -15,8 +15,8 @@ backbone_presets = {
      "mixtral_8_instruct_7b_en": {
          "metadata": {
              "description": (
-                 "Instruction fine-tuned 32-layer Mixtral MoE model",
-                 "with 7 billion active parameters and 8 experts per MoE layer.",
+                 "Instruction fine-tuned 32-layer Mixtral MoE model"
+                 "with 7 billion active parameters and 8 experts per MoE layer."
              ),
              "params": 46702792704,
              "path": "mixtral",
keras_hub/src/models/qwen/qwen_presets.py
@@ -28,8 +28,8 @@ backbone_presets = {
      "qwen2.5_instruct_0.5b_en": {
          "metadata": {
              "description": (
-                 "Instruction fine-tuned 24-layer Qwen model with 0.5 ",
-                 "billion parameters.",
+                 "Instruction fine-tuned 24-layer Qwen model with 0.5 "
+                 "billion parameters."
              ),
              "params": 494032768,
              "path": "qwen",
@@ -39,8 +39,8 @@ backbone_presets = {
      "qwen2.5_instruct_32b_en": {
          "metadata": {
              "description": (
-                 "Instruction fine-tuned 64-layer Qwen model with 32 ",
-                 "billion parameters.",
+                 "Instruction fine-tuned 64-layer Qwen model with 32 "
+                 "billion parameters."
              ),
              "params": 32763876352,
              "path": "qwen",
@@ -50,8 +50,8 @@ backbone_presets = {
      "qwen2.5_instruct_72b_en": {
          "metadata": {
              "description": (
-                 "Instruction fine-tuned 80-layer Qwen model with 72 ",
-                 "billion parameters.",
+                 "Instruction fine-tuned 80-layer Qwen model with 72 "
+                 "billion parameters."
              ),
              "params": 72706203648,
              "path": "qwen",