keras-hub 0.25.0.dev0__py3-none-any.whl → 0.26.0.dev0__py3-none-any.whl
This diff covers publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- keras_hub/layers/__init__.py +21 -0
- keras_hub/models/__init__.py +27 -0
- keras_hub/src/layers/modeling/non_max_supression.py +5 -2
- keras_hub/src/layers/modeling/reversible_embedding.py +2 -275
- keras_hub/src/layers/modeling/token_and_position_embedding.py +6 -6
- keras_hub/src/layers/modeling/transformer_layer_utils.py +9 -9
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +3 -1
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +3 -1
- keras_hub/src/models/albert/albert_backbone.py +1 -3
- keras_hub/src/models/backbone.py +3 -0
- keras_hub/src/models/bart/bart_backbone.py +1 -3
- keras_hub/src/models/bert/bert_backbone.py +2 -4
- keras_hub/src/models/bloom/bloom_backbone.py +1 -3
- keras_hub/src/models/causal_lm.py +2 -2
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -3
- keras_hub/src/models/edrec/edrec_backbone.py +147 -0
- keras_hub/src/models/edrec/edrec_layers.py +434 -0
- keras_hub/src/models/edrec/edrec_seq2seq_lm.py +273 -0
- keras_hub/src/models/electra/electra_backbone.py +1 -3
- keras_hub/src/models/f_net/f_net_backbone.py +1 -3
- keras_hub/src/models/falcon/falcon_backbone.py +1 -3
- keras_hub/src/models/flux/flux_layers.py +3 -3
- keras_hub/src/models/flux/flux_maths.py +29 -15
- keras_hub/src/models/gemma/gemma_backbone.py +1 -3
- keras_hub/src/models/gemma/gemma_causal_lm.py +1 -1
- keras_hub/src/models/gemma3/gemma3_attention.py +1 -1
- keras_hub/src/models/gemma3/gemma3_backbone.py +70 -8
- keras_hub/src/models/gemma3/gemma3_causal_lm.py +16 -1
- keras_hub/src/models/gemma3/gemma3_decoder_block.py +23 -3
- keras_hub/src/models/gemma3/{gemma3_interleave_embeddings.py → gemma3_layers.py} +101 -0
- keras_hub/src/models/gemma3/gemma3_presets.py +79 -7
- keras_hub/src/models/gemma3/gemma3_vision_encoder.py +1 -1
- keras_hub/src/models/gpt2/gpt2_backbone.py +1 -3
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +1 -1
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +1 -3
- keras_hub/src/models/gpt_oss/gpt_oss_backbone.py +1 -3
- keras_hub/src/models/llama/llama_backbone.py +1 -3
- keras_hub/src/models/masked_lm.py +1 -1
- keras_hub/src/models/mistral/mistral_backbone.py +1 -3
- keras_hub/src/models/mixtral/mixtral_backbone.py +1 -3
- keras_hub/src/models/moonshine/moonshine_backbone.py +1 -3
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +1 -3
- keras_hub/src/models/parseq/parseq_tokenizer.py +3 -1
- keras_hub/src/models/phi3/phi3_backbone.py +1 -3
- keras_hub/src/models/qwen/qwen_backbone.py +1 -3
- keras_hub/src/models/qwen/qwen_presets.py +209 -0
- keras_hub/src/models/qwen3/qwen3_backbone.py +1 -3
- keras_hub/src/models/qwen3_moe/qwen3_moe_backbone.py +1 -3
- keras_hub/src/models/qwen3_moe/qwen3_moe_presets.py +15 -0
- keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +1 -3
- keras_hub/src/models/roformer_v2/roformer_v2_backbone.py +1 -3
- keras_hub/src/models/rqvae/__init__.py +5 -0
- keras_hub/src/models/rqvae/rqvae_backbone.py +167 -0
- keras_hub/src/models/rqvae/rqvae_layers.py +335 -0
- keras_hub/src/models/rwkv7/__init__.py +5 -0
- keras_hub/src/models/rwkv7/rwkv7_backbone.py +180 -0
- keras_hub/src/models/rwkv7/rwkv7_causal_lm.py +259 -0
- keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py +214 -0
- keras_hub/src/models/rwkv7/rwkv7_layer.py +724 -0
- keras_hub/src/models/rwkv7/rwkv7_presets.py +26 -0
- keras_hub/src/models/rwkv7/rwkv7_tokenizer.py +495 -0
- keras_hub/src/models/sam/sam_backbone.py +5 -1
- keras_hub/src/models/sam/sam_prompt_encoder.py +1 -1
- keras_hub/src/models/sam3/__init__.py +7 -0
- keras_hub/src/models/sam3/roi_align.py +222 -0
- keras_hub/src/models/sam3/sam3_detr_decoder.py +641 -0
- keras_hub/src/models/sam3/sam3_detr_encoder.py +293 -0
- keras_hub/src/models/sam3/sam3_dot_product_scoring.py +120 -0
- keras_hub/src/models/sam3/sam3_geometry_encoder.py +517 -0
- keras_hub/src/models/sam3/sam3_image_converter.py +10 -0
- keras_hub/src/models/sam3/sam3_layers.py +814 -0
- keras_hub/src/models/sam3/sam3_mask_decoder.py +374 -0
- keras_hub/src/models/sam3/sam3_pc_backbone.py +306 -0
- keras_hub/src/models/sam3/sam3_pc_image_segmenter.py +282 -0
- keras_hub/src/models/sam3/sam3_pc_image_segmenter_preprocessor.py +336 -0
- keras_hub/src/models/sam3/sam3_presets.py +16 -0
- keras_hub/src/models/sam3/sam3_text_encoder.py +212 -0
- keras_hub/src/models/sam3/sam3_tokenizer.py +65 -0
- keras_hub/src/models/sam3/sam3_utils.py +134 -0
- keras_hub/src/models/sam3/sam3_vision_encoder.py +738 -0
- keras_hub/src/models/segformer/segformer_backbone.py +6 -6
- keras_hub/src/models/siglip/siglip_layers.py +1 -3
- keras_hub/src/models/smollm3/smollm3_backbone.py +1 -3
- keras_hub/src/models/stable_diffusion_3/t5_encoder.py +1 -3
- keras_hub/src/models/t5/t5_backbone.py +1 -3
- keras_hub/src/models/t5gemma/t5gemma_backbone.py +1 -3
- keras_hub/src/models/task.py +1 -1
- keras_hub/src/tests/test_case.py +394 -3
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +33 -2
- keras_hub/src/tokenizers/byte_tokenizer.py +3 -1
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +15 -1
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +3 -1
- keras_hub/src/tokenizers/word_piece_tokenizer.py +15 -1
- keras_hub/src/utils/preset_utils.py +1 -1
- keras_hub/src/utils/tensor_utils.py +12 -0
- keras_hub/src/utils/transformers/convert_gemma3.py +68 -22
- keras_hub/src/utils/transformers/convert_qwen3_moe.py +4 -1
- keras_hub/src/utils/transformers/convert_sam3.py +472 -0
- keras_hub/src/utils/transformers/export/gemma3.py +196 -0
- keras_hub/src/utils/transformers/export/hf_exporter.py +86 -25
- keras_hub/src/utils/transformers/export/qwen.py +136 -0
- keras_hub/src/utils/transformers/preset_loader.py +15 -1
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +6 -0
- {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/METADATA +6 -13
- {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/RECORD +108 -76
- {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/WHEEL +1 -1
- keras_hub/src/models/gemma3/rms_normalization.py +0 -26
- {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/top_level.txt +0 -0

@@ -2,6 +2,107 @@ import keras
 from keras import ops
 
 
+class RMSNormalization(keras.layers.Layer):
+    def __init__(self, epsilon=1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self.epsilon = epsilon
+
+    def build(self, input_shape):
+        self.scale = self.add_weight(
+            name="scale",
+            trainable=True,
+            shape=(input_shape[-1],),
+            initializer="zeros",
+        )
+        self.built = True
+
+    def call(self, x):
+        # Always compute normalization in float32.
+        x = ops.cast(x, "float32")
+        scale = ops.cast(self.scale, "float32")
+        var = ops.mean(ops.square(x), axis=-1, keepdims=True)
+        normed_inputs = x * ops.reciprocal(ops.sqrt(var + self.epsilon))
+        normed_inputs = normed_inputs * (1 + scale)
+        return ops.cast(normed_inputs, self.compute_dtype)
+
+
+class Gemma3MeanPooling(keras.layers.Layer):
+    """Mean pooling layer that computes the average of token embeddings.
+
+    This layer correctly handles variable-length sequences by ignoring
+    padded tokens in the mean calculation, using a `padding_mask`.
+
+    Example:
+    ```python
+    import numpy as np
+
+    sequence_output = np.random.rand(2, 4, 8).astype("float32")
+    padding_mask = np.array([[1, 1, 1, 0], [1, 1, 0, 0]], dtype="int32")
+    mean_pool_layer = Gemma3MeanPooling()
+    pooled = mean_pool_layer([sequence_output, padding_mask])
+    # pooled.shape -> (2, 8)
+    ```
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.supports_masking = True
+
+    def call(self, inputs, padding_mask=None):
+        """Performs masked mean pooling on the token embeddings.
+
+        Args:
+            inputs: The sequence of embeddings to pool, with a shape of
+                `(batch_size, seq_len, hidden_dim)`.
+            padding_mask: The mask indicating valid tokens, with a shape of
+                `(batch_size, seq_len)`.
+
+        Returns:
+            A tensor representing the pooled embeddings, with a shape of
+            `(batch_size, hidden_dim)`.
+        """
+        if padding_mask is None:
+            inputs, padding_mask = inputs
+
+        sequence_output = inputs
+        mask = ops.expand_dims(
+            ops.cast(padding_mask, sequence_output.dtype), axis=-1
+        )
+
+        masked_output = sequence_output * mask
+
+        sum_embeddings = ops.sum(masked_output, axis=1)
+
+        num_tokens = ops.sum(
+            ops.cast(padding_mask, sequence_output.dtype), axis=1
+        )
+        num_tokens = ops.expand_dims(num_tokens, axis=1)
+        # Avoid division by zero
+        num_tokens = ops.maximum(num_tokens, 1e-9)
+
+        mean_embeddings = sum_embeddings / num_tokens
+        return ops.cast(mean_embeddings, self.compute_dtype)
+
+    def compute_output_shape(self, input_shape):
+        """Computes the output shape of the layer.
+
+        Args:
+            input_shape: A tuple or list of tuples representing input shapes.
+
+        Returns:
+            A tuple representing the output shape.
+        """
+        if isinstance(input_shape, list):
+            sequence_output_shape = input_shape[0]
+        else:
+            sequence_output_shape = input_shape
+        return sequence_output_shape[:-2] + (sequence_output_shape[-1],)
+
+    def get_config(self):
+        """Returns the config of the layer."""
+        return super().get_config()
+
+
 class Gemma3InterleaveEmbeddings(keras.layers.Layer):
     """Places image embeddings in the correct position in an embedding sequence.
 
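
Unlike `Gemma3MeanPooling`, the relocated `RMSNormalization` layer carries no usage example in its docstring. Below is a minimal sketch of how it behaves; the import path is an assumption based on the module rename in the file list and the import change later in this diff.

```python
# Sketch only: exercising the relocated RMSNormalization layer on dummy data.
# The import path is assumed from the gemma3_layers rename shown in this diff.
import numpy as np

from keras_hub.src.models.gemma3.gemma3_layers import RMSNormalization

x = np.random.rand(2, 4, 8).astype("float32")
norm = RMSNormalization(epsilon=1e-6)
y = norm(x)  # normalizes over the last axis; output shape stays (2, 4, 8)
```

Because the `scale` weight is zero-initialized and applied as `(1 + scale)`, a freshly built layer starts out as plain RMS normalization with unit gain.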
@@ -181,12 +181,25 @@ backbone_presets = {
         },
         "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_270m/4",
     },
+    "medgemma_4b": {
+        "metadata": {
+            "description": (
+                "A 4 billion parameter model based on Gemma 3. "
+                "This model is pre-trained for performance on medical text "
+                "and image comprehension and is optimized for medical "
+                "applications that involve a text generation component."
+            ),
+            "params": 4300079472,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/medgemma/keras/medgemma_4b/1",
+    },
     "medgemma_instruct_4b": {
         "metadata": {
             "description": (
                 "A 4 billion parameter model based on Gemma 3. "
-                "This model is
-                "and image comprehension and is optimized for medical"
+                "This model is instruction-tuned for performance on medical "
+                "text and image comprehension and is optimized for medical "
                 "applications that involve a text generation component."
             ),
             "params": 4300079472,

@@ -198,8 +211,8 @@ backbone_presets = {
         "metadata": {
             "description": (
                 "A 27 billion parameter model based on Gemma 3. "
-                "This model
-                "and image comprehension and is optimized for medical "
+                "This model is instruction-tuned for performance on medical "
+                " text and image comprehension and is optimized for medical "
                 "applications that involve a text generation component."
             ),
             "params": 27432406640,

@@ -211,13 +224,72 @@ backbone_presets = {
         "metadata": {
             "description": (
                 "A 27 billion parameter text-only model based on Gemma 3. "
-                "This model is
-                "comprehension and is optimized for medical
-                "that involve a text generation component."
+                "This model is instruction-tuned (No images) for performance "
+                "on medical text comprehension and is optimized for medical "
+                "applications that involve a text generation component."
             ),
             "params": 27009002240,
             "path": "gemma3",
         },
         "kaggle_handle": "kaggle://keras/medgemma/keras/medgemma_instruct_27b_text/1",
     },
+    "medgemma_1.5_instruct_4b": {
+        "metadata": {
+            "description": (
+                "A 4 billion parameter,Instruct-tuned MedGemma 1.5 4B is an "
+                "updated version of the Instruction-tuned MedGemma 4B model."
+            ),
+            "params": 4300079472,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/medgemma/keras/medgemma_1.5_instruct_4b/1",
+    },
+    "function_gemma_instruct_270m": {
+        "metadata": {
+            "description": (
+                "A 270M Million parameter text-only model based on Gemma 3. "
+                "This model is trained specifically for function calling "
+                "improvements."
+            ),
+            "params": 268098176,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/function-gemma/keras/function_gemma_instruct_270m/1",
+    },
+    "translategemma_4b_it": {
+        "metadata": {
+            "description": (
+                "4 billion parameter, 34-layer, multimodal instruction-tuned "
+                "translation model based on Gemma 3. Supports text and image "
+                "input for translation across 55 languages."
+            ),
+            "params": 4299915632,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/translategemma/keras/translategemma_4b_it/1",
+    },
+    "translategemma_12b_it": {
+        "metadata": {
+            "description": (
+                "12 billion parameter, 48-layer, multimodal instruction-tuned "
+                "translation model based on Gemma 3. Supports text and image "
+                "input for translation across 55 languages."
+            ),
+            "params": 12187079280,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/translategemma/keras/translategemma_12b_it/1",
+    },
+    "translategemma_27b_it": {
+        "metadata": {
+            "description": (
+                "27 billion parameter, 62-layer, multimodal instruction-tuned "
+                "translation model based on Gemma 3. Supports text and image "
+                "input for translation across 55 languages."
+            ),
+            "params": 27432062576,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/translategemma/keras/translategemma_27b_it/1",
+    },
 }
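
The new entries above follow the same `metadata`/`kaggle_handle` structure as the existing Gemma 3 presets, so they should load through the standard preset API once this release is installed. A hedged sketch, assuming the preset names register as usual and the Kaggle weights are reachable from your environment:

```python
# Sketch only: loading one of the newly listed Gemma 3 presets by name.
# "medgemma_4b" is taken from the table above; weights download from Kaggle
# on first use, so this needs network access and Kaggle credentials.
import keras_hub

backbone = keras_hub.models.Backbone.from_preset("medgemma_4b")
print(backbone.count_params())  # should land near the "params" value above
```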

@@ -2,7 +2,7 @@ import keras
 from keras import ops
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.models.
+from keras_hub.src.models.gemma3.gemma3_layers import RMSNormalization
 from keras_hub.src.utils.keras_utils import clone_initializer
 
 

@@ -1,10 +1,8 @@
 import keras
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.layers.modeling.transformer_decoder import TransformerDecoder
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.utils.keras_utils import gelu_approximate
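
The one-line change above repeats across most backbone files in this release: `ReversibleEmbedding` is now imported from `keras.layers` instead of `keras_hub.src.layers.modeling.reversible_embedding`, consistent with the 275-line reduction to that module in the file list. Below is a minimal sketch of the tied input/output embedding these backbones construct; it assumes the `keras.layers` export keeps the familiar KerasHub signature.

```python
# Sketch only: the tied token-embedding / output-projection pattern that the
# backbones below build with ReversibleEmbedding. The signature is assumed to
# match the previous KerasHub layer (input_dim/output_dim, reverse= on call).
import numpy as np
from keras.layers import ReversibleEmbedding

embedding = ReversibleEmbedding(input_dim=1000, output_dim=64, tie_weights=True)
token_ids = np.array([[1, 2, 3]])
hidden = embedding(token_ids)             # (1, 3, 64) token embeddings
logits = embedding(hidden, reverse=True)  # (1, 3, 1000) vocabulary logits
```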

@@ -1,9 +1,7 @@
 import keras
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.gpt_neo_x.gpt_neo_x_decoder import GPTNeoXDecoder
 from keras_hub.src.utils.keras_utils import gelu_approximate

@@ -1,9 +1,7 @@
 import keras
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.gpt_oss.gpt_oss_decoder import (
     GptOssTransformerDecoder,

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.llama.llama_decoder import LlamaTransformerDecoder
 from keras_hub.src.models.llama.llama_layernorm import LlamaLayerNorm

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.mistral.mistral_layer_norm import (
     MistralLayerNormalization,

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.mixtral.mixtral_decoder import (
     MixtralTransformerDecoder,

@@ -1,9 +1,7 @@
 import keras
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.moonshine.moonshine_decoder import (
     MoonshineDecoderBlock,

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.gemma.rms_normalization import RMSNormalization
 from keras_hub.src.models.pali_gemma.pali_gemma_decoder_block import (

@@ -13,9 +13,11 @@ from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 try:
     import tensorflow as tf
-    import tensorflow_text as tf_text
 except ImportError:
     tf = None
+try:
+    import tensorflow_text as tf_text
+except ImportError:
     tf_text = None
 
 PARSEQ_VOCAB = list(
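
Splitting the single guard into two means a missing `tensorflow_text` no longer discards an otherwise usable `tensorflow` import. A small illustrative sketch of the resulting behavior (not code from the package):

```python
# Sketch only: with separate guards, each optional dependency degrades
# independently instead of one failed import nulling both modules.
try:
    import tensorflow as tf
except ImportError:
    tf = None
try:
    import tensorflow_text as tf_text
except ImportError:
    tf_text = None

if tf is not None and tf_text is None:
    # Previously unreachable: the combined guard set tf = None here too.
    print("TensorFlow is available, but tensorflow_text is not installed.")
```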

@@ -1,9 +1,7 @@
 import keras
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.phi3.phi3_decoder import Phi3Decoder
 from keras_hub.src.models.phi3.phi3_layernorm import Phi3LayerNorm

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.qwen.qwen_decoder import QwenTransformerDecoder
 from keras_hub.src.models.qwen.qwen_layernorm import QwenLayerNorm

@@ -1,6 +1,7 @@
 """Qwen preset configurations."""
 
 backbone_presets = {
+    # Qwen 2.5 Models
     "qwen2.5_0.5b_en": {
         "metadata": {
             "description": ("24-layer Qwen model with 0.5 billion parameters."),

@@ -58,4 +59,212 @@ backbone_presets = {
         },
         "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_instruct_72b_en/2",
     },
+    # Qwen 2.5 Coder Models
+    "qwen2.5_coder_0.5b": {
+        "metadata": {
+            "description": (
+                "Code-focused fine-tuned Qwen-2.5 model with 0.5 "
+                "billion parameters."
+            ),
+            "params": 494032768,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_0.5b/1"
+        ),
+    },
+    "qwen2.5_coder_1.5b": {
+        "metadata": {
+            "description": (
+                "Code-focused fine-tuned 28-layer Qwen-2.5 model with 1.5 "
+                "billion parameters."
+            ),
+            "params": 1543434240,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_1.5b/1"
+        ),
+    },
+    "qwen2.5_coder_3b": {
+        "metadata": {
+            "description": (
+                "Code-focused fine-tuned Qwen-2.5 model with 3 "
+                "billion parameters."
+            ),
+            "params": 3085938688,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_3b/1"
+        ),
+    },
+    "qwen2.5_coder_7b": {
+        "metadata": {
+            "description": (
+                "Code-focused fine-tuned Qwen-2.5 model with 7 "
+                "billion parameters."
+            ),
+            "params": 6993420288,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_7b/1"
+        ),
+    },
+    "qwen2.5_coder_14b": {
+        "metadata": {
+            "description": (
+                "Code-focused fine-tuned Qwen-2.5 model with 14 "
+                "billion parameters."
+            ),
+            "params": 14000000000,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_14b/1"
+        ),
+    },
+    "qwen2.5_coder_32b": {
+        "metadata": {
+            "description": (
+                "Code-focused fine-tuned Qwen-2.5 model with 32 "
+                "billion parameters."
+            ),
+            "params": 32763876352,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_32b/1"
+        ),
+    },
+    "qwen2.5_coder_instruct_0.5b": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned code-focused Qwen-2.5 model with "
+                "0.5 billion parameters."
+            ),
+            "params": 494032768,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_instruct_0.5b/1"
+        ),
+    },
+    "qwen2.5_coder_instruct_1.5b": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned code-focused Qwen-2.5 model with "
+                "1.5 billion parameters."
+            ),
+            "params": 1543434240,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_instruct_1.5b/1"
+        ),
+    },
+    "qwen2.5_coder_instruct_3b": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned code-focused Qwen-2.5 model with "
+                "3 billion parameters."
+            ),
+            "params": 3085938688,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_instruct_3b/1"
+        ),
+    },
+    "qwen2.5_coder_instruct_7b": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned code-focused Qwen-2.5 model with "
+                "7 billion parameters."
+            ),
+            "params": 6993420288,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_instruct_7b/1"
+        ),
+    },
+    "qwen2.5_coder_instruct_14b": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned code-focused Qwen-2.5 model with "
+                "14 billion parameters."
+            ),
+            "params": 14000000000,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_instruct_14b/1"
+        ),
+    },
+    "qwen2.5_coder_instruct_32b": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned code-focused Qwen-2.5 model with "
+                "32 billion parameters."
+            ),
+            "params": 32763876352,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-coder/keras/qwen2.5_coder_instruct_32b/1"
+        ),
+    },
+    # Qwen 2.5 Math Models
+    "qwen2.5_math_1.5b_en": {
+        "metadata": {
+            "description": (
+                "Math-focused Qwen-2.5 model with 1.5 billion parameters."
+            ),
+            "params": 1543714304,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-math/keras/qwen2.5_math_1.5b_en/1"
+        ),
+    },
+    "qwen2.5_math_instruct_1.5b_en": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned math-focused Qwen-2.5 model with "
+                "1.5 billion parameters."
+            ),
+            "params": 1543714304,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-math/keras/qwen2.5_math_instruct_1.5b_en/1"
+        ),
+    },
+    "qwen2.5_math_7b_en": {
+        "metadata": {
+            "description": (
+                "Math-focused Qwen-2.5 model with 7 billion parameters."
+            ),
+            "params": 7615616512,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-math/keras/qwen2.5_math_7b_en/1"
+        ),
+    },
+    "qwen2.5_math_instruct_7b_en": {
+        "metadata": {
+            "description": (
+                "Instruction-tuned math-focused Qwen-2.5 model with "
+                "7 billion parameters."
+            ),
+            "params": 7615616512,
+            "path": "qwen",
+        },
+        "kaggle_handle": (
+            "kaggle://keras/qwen2-5-math/keras/qwen2.5_math_instruct_7b_en/1"
+        ),
+    },
 }
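
The Coder and Math entries above reuse the existing Qwen 2.5 backbone, so they are expected to work with the usual task classes. A hedged sketch of generating from one of the new instruct presets, assuming the preset name registers as usual and the Kaggle weights are reachable:

```python
# Sketch only: text generation from one of the newly listed Qwen 2.5 Coder
# presets. The preset name comes from the table above; weights download from
# Kaggle on first use.
import keras_hub

causal_lm = keras_hub.models.CausalLM.from_preset("qwen2.5_coder_instruct_0.5b")
print(causal_lm.generate("Write a Python function that reverses a string.", max_length=128))
```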

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.qwen3.qwen3_decoder import Qwen3TransformerDecoder
 from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.qwen3_moe.qwen3_moe_decoder import (
     Qwen3MoeTransformerDecoder,

@@ -1,6 +1,7 @@
 """Qwen3 MoE model preset configurations."""
 
 backbone_presets = {
+    # Qwen-3 MoE Models
     "qwen3_moe_30b_a3b_en": {
         "metadata": {
             "description": (

@@ -27,4 +28,18 @@ backbone_presets = {
         },
         "kaggle_handle": "kaggle://keras/qwen-3-moe/keras/qwen3_moe_235b_a22b_en/1",
     },
+    # Qwen-3 Coder MoE Models
+    "qwen3_coder_instruct_30b_a3b_en": {
+        "metadata": {
+            "description": (
+                "A Code-Specific Model,Mixture-of-Experts (MoE) model "
+                "has 30.5 billion total parameters with 3.3 billion "
+                "activated, built on 48 layers and utilizes 32 query "
+                "and 4 key/value attention heads with 128 experts (8 active)."
+            ),
+            "params": 30532122624,
+            "path": "qwen3_moe",
+        },
+        "kaggle_handle": "kaggle://keras/qwen3-coder/keras/qwen3_coder_instruct_30b_a3b_en/1",
+    },
 }

@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.qwen.qwen_layernorm import QwenLayerNorm
 from keras_hub.src.models.qwen_moe.qwen_moe_decoder import (