keras-hub 0.20.0.dev1__py3-none-any.whl → 0.21.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +15 -33
- keras_hub/layers/__init__.py +134 -0
- keras_hub/metrics/__init__.py +11 -0
- keras_hub/models/__init__.py +642 -0
- keras_hub/samplers/__init__.py +18 -0
- keras_hub/src/layers/modeling/reversible_embedding.py +25 -35
- keras_hub/src/layers/preprocessing/image_converter.py +1 -0
- keras_hub/src/layers/preprocessing/random_deletion.py +1 -1
- keras_hub/src/layers/preprocessing/random_swap.py +1 -1
- keras_hub/src/models/audio_to_text.py +66 -0
- keras_hub/src/models/audio_to_text_preprocessor.py +80 -0
- keras_hub/src/models/backbone.py +5 -2
- keras_hub/src/models/cspnet/cspnet_backbone.py +51 -26
- keras_hub/src/models/cspnet/cspnet_presets.py +38 -3
- keras_hub/src/models/falcon/falcon_backbone.py +1 -1
- keras_hub/src/models/gemma/gemma_presets.py +10 -10
- keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +3 -2
- keras_hub/src/models/gemma3/gemma3_presets.py +8 -8
- keras_hub/src/models/gemma3/gemma3_vision_encoder.py +1 -1
- keras_hub/src/models/llama/llama_attention.py +24 -6
- keras_hub/src/models/llama/llama_backbone.py +50 -16
- keras_hub/src/models/llama/llama_decoder.py +20 -3
- keras_hub/src/models/llama/llama_presets.py +3 -3
- keras_hub/src/models/llama/llama_rotary_embedding.py +180 -0
- keras_hub/src/models/llama3/llama3_backbone.py +10 -2
- keras_hub/src/models/llama3/llama3_presets.py +84 -2
- keras_hub/src/models/mistral/mistral_presets.py +3 -3
- keras_hub/src/models/mixtral/__init__.py +5 -0
- keras_hub/src/models/mixtral/mixtral_attention.py +252 -0
- keras_hub/src/models/mixtral/mixtral_backbone.py +207 -0
- keras_hub/src/models/mixtral/mixtral_causal_lm.py +281 -0
- keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py +76 -0
- keras_hub/src/models/mixtral/mixtral_decoder.py +494 -0
- keras_hub/src/models/mixtral/mixtral_layer_norm.py +34 -0
- keras_hub/src/models/mixtral/mixtral_presets.py +26 -0
- keras_hub/src/models/mixtral/mixtral_tokenizer.py +21 -0
- keras_hub/src/models/moonshine/__init__.py +5 -0
- keras_hub/src/models/moonshine/moonshine_audio_converter.py +301 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text.py +383 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py +272 -0
- keras_hub/src/models/moonshine/moonshine_backbone.py +478 -0
- keras_hub/src/models/moonshine/moonshine_decoder.py +313 -0
- keras_hub/src/models/moonshine/moonshine_encoder.py +212 -0
- keras_hub/src/models/moonshine/moonshine_layers.py +239 -0
- keras_hub/src/models/moonshine/moonshine_multi_head_attention.py +355 -0
- keras_hub/src/models/moonshine/moonshine_presets.py +25 -0
- keras_hub/src/models/moonshine/moonshine_tokenizer.py +62 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +11 -11
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +1 -1
- keras_hub/src/models/qwen/__init__.py +4 -0
- keras_hub/src/models/qwen/qwen_attention.py +3 -1
- keras_hub/src/models/qwen/qwen_backbone.py +8 -1
- keras_hub/src/models/qwen/qwen_causal_lm.py +7 -0
- keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py +7 -0
- keras_hub/src/models/qwen/qwen_presets.py +61 -0
- keras_hub/src/models/qwen/qwen_tokenizer.py +9 -0
- keras_hub/src/models/qwen_moe/__init__.py +5 -0
- keras_hub/src/models/qwen_moe/qwen_moe_attention.py +375 -0
- keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +373 -0
- keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py +350 -0
- keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py +17 -0
- keras_hub/src/models/qwen_moe/qwen_moe_decoder.py +625 -0
- keras_hub/src/models/qwen_moe/qwen_moe_layernorm.py +32 -0
- keras_hub/src/models/qwen_moe/qwen_moe_presets.py +15 -0
- keras_hub/src/models/qwen_moe/qwen_moe_tokenizer.py +46 -0
- keras_hub/src/models/retinanet/retinanet_image_converter.py +0 -13
- keras_hub/src/models/retinanet/retinanet_presets.py +2 -2
- keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +0 -18
- keras_hub/src/models/segformer/segformer_presets.py +12 -12
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +6 -0
- keras_hub/src/models/task.py +5 -2
- keras_hub/src/models/xception/__init__.py +5 -0
- keras_hub/src/models/xception/xception_backbone.py +188 -0
- keras_hub/src/models/xception/xception_image_classifier.py +12 -0
- keras_hub/src/models/xception/xception_image_classifier_preprocessor.py +14 -0
- keras_hub/src/models/xception/xception_image_converter.py +8 -0
- keras_hub/src/models/xception/xception_presets.py +14 -0
- keras_hub/src/tests/mocks/mock_gemma3_tokenizer.py +155 -0
- keras_hub/src/utils/coco/__init__.py +0 -0
- keras_hub/src/utils/coco/coco_utils.py +133 -0
- keras_hub/src/utils/imagenet/imagenet_utils.py +36 -0
- keras_hub/src/utils/keras_utils.py +11 -0
- keras_hub/src/utils/preset_utils.py +70 -10
- keras_hub/src/utils/tensor_utils.py +27 -1
- keras_hub/src/utils/timm/convert_cspnet.py +94 -23
- keras_hub/src/utils/timm/preset_loader.py +6 -6
- keras_hub/src/utils/transformers/convert_llama3.py +21 -1
- keras_hub/src/utils/transformers/convert_mixtral.py +139 -0
- keras_hub/src/utils/transformers/convert_qwen.py +1 -0
- keras_hub/src/utils/transformers/convert_qwen_moe.py +253 -0
- keras_hub/src/utils/transformers/preset_loader.py +6 -0
- keras_hub/src/{version_utils.py → version.py} +1 -1
- keras_hub/tokenizers/__init__.py +117 -0
- keras_hub/utils/__init__.py +21 -0
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/METADATA +6 -20
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/RECORD +98 -55
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/WHEEL +1 -1
- keras_hub/api/__init__.py +0 -15
- keras_hub/api/layers/__init__.py +0 -86
- keras_hub/api/metrics/__init__.py +0 -11
- keras_hub/api/models/__init__.py +0 -416
- keras_hub/api/samplers/__init__.py +0 -16
- keras_hub/api/tokenizers/__init__.py +0 -58
- keras_hub/api/utils/__init__.py +0 -9
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from keras_hub.src.models.mixtral.mixtral_backbone import MixtralBackbone
|
|
4
|
+
from keras_hub.src.utils.preset_utils import get_file
|
|
5
|
+
|
|
6
|
+
backbone_cls = MixtralBackbone
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def convert_backbone_config(transformers_config):
    """Rename Hugging Face Mixtral config fields to KerasHub argument names.

    Args:
        transformers_config: mapping indexed by the Hugging Face field names
            listed in `field_map` below.

    Returns:
        A new dict keyed by the KerasHub names, in `field_map` order, whose
        values are taken unchanged from `transformers_config`.

    Every listed field is looked up with `[]`, so a missing field propagates
    the mapping's lookup error (`KeyError` for a plain dict).
    """
    # (KerasHub argument name, Hugging Face config field)
    field_map = (
        ("vocabulary_size", "vocab_size"),
        ("num_layers", "num_hidden_layers"),
        ("num_query_heads", "num_attention_heads"),
        ("hidden_dim", "hidden_size"),
        ("intermediate_dim", "intermediate_size"),
        ("num_key_value_heads", "num_key_value_heads"),
        ("num_experts", "num_local_experts"),
        ("top_k", "num_experts_per_tok"),
        ("rope_max_wavelength", "rope_theta"),
        ("layer_norm_epsilon", "rms_norm_eps"),
        ("sliding_window", "sliding_window"),
        ("output_router_logits", "output_router_logits"),
    )
    return {
        keras_name: transformers_config[hf_name]
        for keras_name, hf_name in field_map
    }
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def convert_weights(backbone, loader, transformers_config):
    """Copy Hugging Face Mixtral checkpoint tensors into `backbone`.

    Args:
        backbone: model exposing `get_layer`, `num_layers` and `num_experts`;
            its layers are looked up by the names used below.
        loader: object providing `port_weight(keras_variable, hf_weight_key,
            hook_fn=...)` and `get_tensor(hf_weight_key)`.
        transformers_config: accepted for interface compatibility; not read
            by this function.

    Returns:
        The same `backbone` object, after its variables were assigned.
    """

    def transpose_2d(hf_tensor, _):
        # Swap the two axes of a checkpoint matrix; the target shape that the
        # loader passes as second argument is not needed.
        return np.transpose(hf_tensor, axes=(1, 0))

    def transpose_and_reshape(x, shape):
        # Reverse the axes, then reshape to the shape the loader passes in.
        return np.reshape(np.transpose(x), shape)

    def load_expert_kernel(hf_prefix, expert_idx, weight_name):
        # Fetch one expert matrix from the checkpoint and transpose it.
        tensor = loader.get_tensor(
            f"{hf_prefix}.block_sparse_moe.experts.{expert_idx}"
            f".{weight_name}.weight"
        )
        return np.transpose(tensor, axes=(1, 0))

    # Embeddings: input table as-is, output projection transposed.
    token_embedding = backbone.get_layer("token_embedding")
    loader.port_weight(
        keras_variable=token_embedding.embeddings,
        hf_weight_key="model.embed_tokens.weight",
    )
    loader.port_weight(
        keras_variable=token_embedding.reverse_embeddings,
        hf_weight_key="lm_head.weight",
        hook_fn=transpose_2d,
    )

    # (attribute on the Keras attention layer, Hugging Face projection name)
    attention_projections = (
        ("query_dense", "q_proj"),
        ("key_dense", "k_proj"),
        ("value_dense", "v_proj"),
        ("output_dense", "o_proj"),
    )

    for i in range(backbone.num_layers):
        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")
        hf_prefix = f"model.layers.{i}"

        # Input layernorm
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layernorm.scale,
            hf_weight_key=f"{hf_prefix}.input_layernorm.weight",
        )

        # Attention layers: query, key, value, output (in that order).
        attention = decoder_layer._self_attention_layer
        for keras_attr, hf_name in attention_projections:
            loader.port_weight(
                keras_variable=getattr(attention, keras_attr).kernel,
                hf_weight_key=f"{hf_prefix}.self_attn.{hf_name}.weight",
                hook_fn=transpose_and_reshape,
            )

        # MoE layers
        moe_block = decoder_layer._sparse_moe_block

        # Router gate
        loader.port_weight(
            keras_variable=moe_block._sparse_feedforward_gate_dense.kernel,
            hf_weight_key=f"{hf_prefix}.block_sparse_moe.gate.weight",
            hook_fn=transpose_2d,
        )

        # Batched experts: w1 (gate), w3 (intermediate), and w2 (output).
        # Tensors are read expert by expert in w1, w3, w2 order.
        gate_kernels = []
        intermediate_kernels = []
        output_kernels = []
        for expert_idx in range(backbone.num_experts):
            gate_kernels.append(
                load_expert_kernel(hf_prefix, expert_idx, "w1")
            )
            intermediate_kernels.append(
                load_expert_kernel(hf_prefix, expert_idx, "w3")
            )
            output_kernels.append(
                load_expert_kernel(hf_prefix, expert_idx, "w2")
            )

        # Stack every expert along a new leading axis before assigning, so
        # nothing is written if any stack fails.
        gate_batched = np.stack(gate_kernels, axis=0)
        intermediate_batched = np.stack(intermediate_kernels, axis=0)
        output_batched = np.stack(output_kernels, axis=0)

        # Assign batched weights to expert_bank
        expert_bank = moe_block.expert_bank
        expert_bank._expert_feedforward_gate_dense.assign(gate_batched)
        expert_bank._expert_feedforward_intermediate_dense.assign(
            intermediate_batched
        )
        expert_bank._expert_feedforward_output_dense.assign(output_batched)

        # Feedforward layernorm
        loader.port_weight(
            keras_variable=decoder_layer._feedforward_layernorm.scale,
            hf_weight_key=f"{hf_prefix}.post_attention_layernorm.weight",
        )

    # Final normalization layer
    loader.port_weight(
        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
        hf_weight_key="model.norm.weight",
    )

    return backbone
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def convert_tokenizer(cls, preset, **kwargs):
    """Instantiate `cls` from the preset's `tokenizer.model` file.

    Args:
        cls: tokenizer class (or any callable) to construct.
        preset: preset identifier forwarded to `get_file`.
        **kwargs: extra keyword arguments forwarded to `cls`.

    Returns:
        `cls(<result of get_file>, **kwargs)`.
    """
    tokenizer_model = get_file(preset, "tokenizer.model")
    return cls(tokenizer_model, **kwargs)
|
|
@@ -18,6 +18,7 @@ def convert_backbone_config(transformers_config):
|
|
|
18
18
|
"rope_max_wavelength": transformers_config["rope_theta"],
|
|
19
19
|
"use_sliding_window": transformers_config["use_sliding_window"],
|
|
20
20
|
"sliding_window_size": transformers_config["sliding_window"],
|
|
21
|
+
"tie_word_embeddings": transformers_config["tie_word_embeddings"],
|
|
21
22
|
}
|
|
22
23
|
|
|
23
24
|
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from keras_hub.src.models.qwen_moe.qwen_moe_backbone import QwenMoeBackbone
|
|
4
|
+
from keras_hub.src.utils.preset_utils import load_json
|
|
5
|
+
|
|
6
|
+
backbone_cls = QwenMoeBackbone
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def convert_backbone_config(transformers_config):
    """Rename Hugging Face Qwen MoE config fields to KerasHub argument names.

    Args:
        transformers_config: mapping indexed by the Hugging Face field names
            listed in `field_map` below.

    Returns:
        A new dict keyed by the KerasHub names, in `field_map` order, whose
        values are taken unchanged from `transformers_config`.

    Every listed field is looked up with `[]`, so a missing field propagates
    the mapping's lookup error (`KeyError` for a plain dict).
    """
    # (KerasHub argument name, Hugging Face config field)
    field_map = (
        ("vocabulary_size", "vocab_size"),
        ("hidden_dim", "hidden_size"),
        ("num_layers", "num_hidden_layers"),
        ("num_query_heads", "num_attention_heads"),
        ("num_key_value_heads", "num_key_value_heads"),
        ("intermediate_dim", "intermediate_size"),
        ("moe_intermediate_dim", "moe_intermediate_size"),
        ("shared_expert_intermediate_dim", "shared_expert_intermediate_size"),
        ("num_experts", "num_experts"),
        ("top_k", "num_experts_per_tok"),
        ("norm_top_k_prob", "norm_topk_prob"),
        ("decoder_sparse_step", "decoder_sparse_step"),
        ("layer_norm_epsilon", "rms_norm_eps"),
        ("rope_max_wavelength", "rope_theta"),
        ("use_sliding_window", "use_sliding_window"),
        ("sliding_window_size", "sliding_window"),
        ("output_router_logits", "output_router_logits"),
        ("router_aux_loss_coefficient", "router_aux_loss_coef"),
    )
    return {
        keras_name: transformers_config[hf_name]
        for keras_name, hf_name in field_map
    }
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def convert_weights(backbone, loader, transformers_config):
    """Copy Hugging Face Qwen MoE checkpoint tensors into `backbone`.

    Args:
        backbone: model exposing `get_layer`, `num_layers`, `num_experts`,
            `tie_word_embeddings`, `mlp_only_layers` and
            `decoder_sparse_step`; its layers are looked up by the names
            used below.
        loader: object providing `port_weight(keras_variable, hf_weight_key,
            hook_fn=...)` and `get_tensor(hf_weight_key)`.
        transformers_config: accepted for interface compatibility; not read
            by this function.

    Returns:
        The same `backbone` object, after its variables were assigned.
    """

    def transpose_2d(hf_tensor, _):
        # Swap the two axes of a checkpoint matrix; the target shape that the
        # loader passes as second argument is not needed.
        return np.transpose(hf_tensor, axes=(1, 0))

    def transpose_and_reshape(x, shape):
        # Reverse the axes, then reshape to the shape the loader passes in.
        # Also used for the 1-D biases, where only the reshape has an effect.
        return np.reshape(np.transpose(x), shape)

    # (attribute on the Keras layer, Hugging Face projection name); the
    # same three projections are used for the shared expert and for the
    # dense (non-MoE) feedforward.
    feedforward_projections = (
        ("_feedforward_intermediate_dense", "up_proj"),
        ("_feedforward_output_dense", "down_proj"),
        ("_feedforward_gate_dense", "gate_proj"),
    )
    # Projections that carry both a kernel and a bias.
    biased_attention_projections = (
        ("query_dense", "q_proj"),
        ("key_dense", "k_proj"),
        ("value_dense", "v_proj"),
    )

    token_embedding = backbone.get_layer("token_embedding")
    loader.port_weight(
        keras_variable=token_embedding.embeddings,
        hf_weight_key="model.embed_tokens.weight",
    )
    # A separate output projection is only ported when it is not tied to
    # the input embeddings.
    if not backbone.tie_word_embeddings:
        loader.port_weight(
            keras_variable=token_embedding.reverse_embeddings,
            hf_weight_key="lm_head.weight",
            hook_fn=transpose_2d,
        )

    for i in range(backbone.num_layers):
        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")
        hf_prefix = f"model.layers.{i}"

        # Input layernorm
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layernorm.scale,
            hf_weight_key=f"{hf_prefix}.input_layernorm.weight",
        )

        # Attention layers: kernel then bias for query, key and value.
        attention = decoder_layer._self_attention_layer
        for keras_attr, hf_name in biased_attention_projections:
            projection = getattr(attention, keras_attr)
            loader.port_weight(
                keras_variable=projection.kernel,
                hf_weight_key=f"{hf_prefix}.self_attn.{hf_name}.weight",
                hook_fn=transpose_and_reshape,
            )
            loader.port_weight(
                keras_variable=projection.bias,
                hf_weight_key=f"{hf_prefix}.self_attn.{hf_name}.bias",
                hook_fn=transpose_and_reshape,
            )
        # Output projection: kernel only.
        loader.port_weight(
            keras_variable=attention._output_dense.kernel,
            hf_weight_key=f"{hf_prefix}.self_attn.o_proj.weight",
            hook_fn=transpose_and_reshape,
        )

        # MLP layers: a layer takes the MoE path unless it is listed in
        # `mlp_only_layers`, there are no experts, or it falls off the
        # sparse-step grid.
        is_moe_layer = (
            (i not in backbone.mlp_only_layers)
            and backbone.num_experts > 0
            and ((i + 1) % backbone.decoder_sparse_step == 0)
        )
        if is_moe_layer:
            # Router
            loader.port_weight(
                keras_variable=decoder_layer.mlp._sparse_feedforward_gate_dense.kernel,
                hf_weight_key=f"{hf_prefix}.mlp.gate.weight",
                hook_fn=transpose_2d,
            )

            # Batched experts: fused gate/up kernels and down kernels.
            fused_gate_up_kernels = []
            down_kernels = []
            for expert_idx in range(backbone.num_experts):
                expert_prefix = f"{hf_prefix}.mlp.experts.{expert_idx}"
                gate_kernel = loader.get_tensor(
                    f"{expert_prefix}.gate_proj.weight"
                )
                up_kernel = loader.get_tensor(
                    f"{expert_prefix}.up_proj.weight"
                )
                # Transpose both, then join them along the last axis
                # (gate first, up second).
                fused_gate_up_kernels.append(
                    np.concatenate(
                        [
                            np.transpose(gate_kernel, axes=(1, 0)),
                            np.transpose(up_kernel, axes=(1, 0)),
                        ],
                        axis=-1,
                    )
                )

                down_kernel = loader.get_tensor(
                    f"{expert_prefix}.down_proj.weight"
                )
                down_kernels.append(np.transpose(down_kernel, axes=(1, 0)))

            # Stack along a new leading expert axis before assigning, so
            # nothing is written if any stack fails.
            gate_up_proj_batched = np.stack(fused_gate_up_kernels, axis=0)
            down_proj_batched = np.stack(down_kernels, axis=0)

            # Assign batched weights to expert_bank
            decoder_layer.mlp.expert_bank._expert_feedforward_gate_dense.assign(
                gate_up_proj_batched
            )
            decoder_layer.mlp.expert_bank._expert_feedforward_output_dense.assign(
                down_proj_batched
            )

            # Shared expert: up, down, gate projections, then its gate.
            for keras_attr, hf_name in feedforward_projections:
                loader.port_weight(
                    keras_variable=getattr(
                        decoder_layer.mlp.shared_expert_dense, keras_attr
                    ).kernel,
                    hf_weight_key=(
                        f"{hf_prefix}.mlp.shared_expert.{hf_name}.weight"
                    ),
                    hook_fn=transpose_2d,
                )

            loader.port_weight(
                keras_variable=decoder_layer.mlp.shared_expert_gate_dense.kernel,
                hf_weight_key=f"{hf_prefix}.mlp.shared_expert_gate.weight",
                hook_fn=transpose_2d,
            )
        else:
            # Dense feedforward: up, down, gate projections.
            for keras_attr, hf_name in feedforward_projections:
                loader.port_weight(
                    keras_variable=getattr(decoder_layer, keras_attr).kernel,
                    hf_weight_key=f"{hf_prefix}.mlp.{hf_name}.weight",
                    hook_fn=transpose_2d,
                )

        # Feedforward layernorm
        loader.port_weight(
            keras_variable=decoder_layer._feedforward_layernorm.scale,
            hf_weight_key=f"{hf_prefix}.post_attention_layernorm.weight",
        )

    # Final normalization layer
    loader.port_weight(
        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
        hf_weight_key="model.norm.weight",
    )

    return backbone
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def convert_tokenizer(cls, preset, **kwargs):
    """Build a `cls` tokenizer from the preset's `tokenizer.json`.

    Reads the vocabulary and merge rules from the `"model"` section, adds
    every entry of `"added_tokens"` (except reserved placeholders) to the
    vocabulary, and passes those added tokens as `unsplittable_tokens`.

    Args:
        cls: tokenizer class (or any callable) accepting `vocabulary`,
            `merges` and `unsplittable_tokens` keyword arguments.
        preset: preset identifier forwarded to `load_json`.
        **kwargs: extra keyword arguments forwarded to `cls`; any
            `unsplittable_tokens` entry is overwritten.

    Returns:
        `cls(vocabulary=..., merges=..., **kwargs)`.
    """
    tokenizer_config = load_json(preset, "tokenizer.json")
    vocab = tokenizer_config["model"]["vocab"]

    # Merge rules may be serialized either as "left right" strings or as
    # [left, right] pairs; normalize both to the space-joined string form.
    merges = [
        merge if isinstance(merge, str) else " ".join(merge)
        for merge in tokenizer_config["model"]["merges"]
    ]

    # Load all special tokens with the exception of "reserved" ones.
    # A dict (not a set) de-duplicates while keeping the order in which the
    # tokens appear in the file, so the resulting list is identical from run
    # to run instead of depending on string hash randomization.
    special_tokens = {}
    for token in tokenizer_config["added_tokens"]:
        content = token["content"]
        if content.startswith("<|reserved_special_token_"):
            continue
        vocab[content] = token["id"]
        special_tokens[content] = None

    kwargs["unsplittable_tokens"] = list(special_tokens)

    return cls(vocabulary=vocab, merges=merges, **kwargs)
|
|
@@ -11,8 +11,10 @@ from keras_hub.src.utils.transformers import convert_gemma
|
|
|
11
11
|
from keras_hub.src.utils.transformers import convert_gpt2
|
|
12
12
|
from keras_hub.src.utils.transformers import convert_llama3
|
|
13
13
|
from keras_hub.src.utils.transformers import convert_mistral
|
|
14
|
+
from keras_hub.src.utils.transformers import convert_mixtral
|
|
14
15
|
from keras_hub.src.utils.transformers import convert_pali_gemma
|
|
15
16
|
from keras_hub.src.utils.transformers import convert_qwen
|
|
17
|
+
from keras_hub.src.utils.transformers import convert_qwen_moe
|
|
16
18
|
from keras_hub.src.utils.transformers import convert_vit
|
|
17
19
|
from keras_hub.src.utils.transformers.safetensor_utils import SafetensorLoader
|
|
18
20
|
|
|
@@ -44,6 +46,10 @@ class TransformersPresetLoader(PresetLoader):
|
|
|
44
46
|
self.converter = convert_vit
|
|
45
47
|
elif model_type == "qwen2":
|
|
46
48
|
self.converter = convert_qwen
|
|
49
|
+
elif model_type == "mixtral":
|
|
50
|
+
self.converter = convert_mixtral
|
|
51
|
+
elif model_type == "qwen2_moe":
|
|
52
|
+
self.converter = convert_qwen_moe
|
|
47
53
|
else:
|
|
48
54
|
raise ValueError(
|
|
49
55
|
"KerasHub has no converter for huggingface/transformers models "
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""DO NOT EDIT.
|
|
2
|
+
|
|
3
|
+
This file was autogenerated. Do not edit it by hand,
|
|
4
|
+
since your modifications would be overwritten.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from keras_hub.src.models.albert.albert_tokenizer import (
|
|
8
|
+
AlbertTokenizer as AlbertTokenizer,
|
|
9
|
+
)
|
|
10
|
+
from keras_hub.src.models.bart.bart_tokenizer import (
|
|
11
|
+
BartTokenizer as BartTokenizer,
|
|
12
|
+
)
|
|
13
|
+
from keras_hub.src.models.bert.bert_tokenizer import (
|
|
14
|
+
BertTokenizer as BertTokenizer,
|
|
15
|
+
)
|
|
16
|
+
from keras_hub.src.models.bloom.bloom_tokenizer import (
|
|
17
|
+
BloomTokenizer as BloomTokenizer,
|
|
18
|
+
)
|
|
19
|
+
from keras_hub.src.models.clip.clip_tokenizer import (
|
|
20
|
+
CLIPTokenizer as CLIPTokenizer,
|
|
21
|
+
)
|
|
22
|
+
from keras_hub.src.models.deberta_v3.deberta_v3_tokenizer import (
|
|
23
|
+
DebertaV3Tokenizer as DebertaV3Tokenizer,
|
|
24
|
+
)
|
|
25
|
+
from keras_hub.src.models.distil_bert.distil_bert_tokenizer import (
|
|
26
|
+
DistilBertTokenizer as DistilBertTokenizer,
|
|
27
|
+
)
|
|
28
|
+
from keras_hub.src.models.electra.electra_tokenizer import (
|
|
29
|
+
ElectraTokenizer as ElectraTokenizer,
|
|
30
|
+
)
|
|
31
|
+
from keras_hub.src.models.f_net.f_net_tokenizer import (
|
|
32
|
+
FNetTokenizer as FNetTokenizer,
|
|
33
|
+
)
|
|
34
|
+
from keras_hub.src.models.falcon.falcon_tokenizer import (
|
|
35
|
+
FalconTokenizer as FalconTokenizer,
|
|
36
|
+
)
|
|
37
|
+
from keras_hub.src.models.gemma.gemma_tokenizer import (
|
|
38
|
+
GemmaTokenizer as GemmaTokenizer,
|
|
39
|
+
)
|
|
40
|
+
from keras_hub.src.models.gemma3.gemma3_tokenizer import (
|
|
41
|
+
Gemma3Tokenizer as Gemma3Tokenizer,
|
|
42
|
+
)
|
|
43
|
+
from keras_hub.src.models.gpt2.gpt2_tokenizer import (
|
|
44
|
+
GPT2Tokenizer as GPT2Tokenizer,
|
|
45
|
+
)
|
|
46
|
+
from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import (
|
|
47
|
+
GPTNeoXTokenizer as GPTNeoXTokenizer,
|
|
48
|
+
)
|
|
49
|
+
from keras_hub.src.models.llama.llama_tokenizer import (
|
|
50
|
+
LlamaTokenizer as LlamaTokenizer,
|
|
51
|
+
)
|
|
52
|
+
from keras_hub.src.models.llama3.llama3_tokenizer import (
|
|
53
|
+
Llama3Tokenizer as Llama3Tokenizer,
|
|
54
|
+
)
|
|
55
|
+
from keras_hub.src.models.mistral.mistral_tokenizer import (
|
|
56
|
+
MistralTokenizer as MistralTokenizer,
|
|
57
|
+
)
|
|
58
|
+
from keras_hub.src.models.mixtral.mixtral_tokenizer import (
|
|
59
|
+
MixtralTokenizer as MixtralTokenizer,
|
|
60
|
+
)
|
|
61
|
+
from keras_hub.src.models.moonshine.moonshine_tokenizer import (
|
|
62
|
+
MoonshineTokenizer as MoonshineTokenizer,
|
|
63
|
+
)
|
|
64
|
+
from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer as OPTTokenizer
|
|
65
|
+
from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import (
|
|
66
|
+
PaliGemmaTokenizer as PaliGemmaTokenizer,
|
|
67
|
+
)
|
|
68
|
+
from keras_hub.src.models.phi3.phi3_tokenizer import (
|
|
69
|
+
Phi3Tokenizer as Phi3Tokenizer,
|
|
70
|
+
)
|
|
71
|
+
from keras_hub.src.models.qwen.qwen_tokenizer import (
|
|
72
|
+
QwenTokenizer as Qwen2Tokenizer,
|
|
73
|
+
)
|
|
74
|
+
from keras_hub.src.models.qwen.qwen_tokenizer import (
|
|
75
|
+
QwenTokenizer as QwenTokenizer,
|
|
76
|
+
)
|
|
77
|
+
from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import (
|
|
78
|
+
QwenMoeTokenizer as QwenMoeTokenizer,
|
|
79
|
+
)
|
|
80
|
+
from keras_hub.src.models.roberta.roberta_tokenizer import (
|
|
81
|
+
RobertaTokenizer as RobertaTokenizer,
|
|
82
|
+
)
|
|
83
|
+
from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import (
|
|
84
|
+
RoformerV2Tokenizer as RoformerV2Tokenizer,
|
|
85
|
+
)
|
|
86
|
+
from keras_hub.src.models.siglip.siglip_tokenizer import (
|
|
87
|
+
SigLIPTokenizer as SigLIPTokenizer,
|
|
88
|
+
)
|
|
89
|
+
from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer as T5Tokenizer
|
|
90
|
+
from keras_hub.src.models.whisper.whisper_tokenizer import (
|
|
91
|
+
WhisperTokenizer as WhisperTokenizer,
|
|
92
|
+
)
|
|
93
|
+
from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
|
|
94
|
+
XLMRobertaTokenizer as XLMRobertaTokenizer,
|
|
95
|
+
)
|
|
96
|
+
from keras_hub.src.tokenizers.byte_pair_tokenizer import (
|
|
97
|
+
BytePairTokenizer as BytePairTokenizer,
|
|
98
|
+
)
|
|
99
|
+
from keras_hub.src.tokenizers.byte_tokenizer import (
|
|
100
|
+
ByteTokenizer as ByteTokenizer,
|
|
101
|
+
)
|
|
102
|
+
from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
|
|
103
|
+
SentencePieceTokenizer as SentencePieceTokenizer,
|
|
104
|
+
)
|
|
105
|
+
from keras_hub.src.tokenizers.sentence_piece_tokenizer_trainer import (
|
|
106
|
+
compute_sentence_piece_proto as compute_sentence_piece_proto,
|
|
107
|
+
)
|
|
108
|
+
from keras_hub.src.tokenizers.tokenizer import Tokenizer as Tokenizer
|
|
109
|
+
from keras_hub.src.tokenizers.unicode_codepoint_tokenizer import (
|
|
110
|
+
UnicodeCodepointTokenizer as UnicodeCodepointTokenizer,
|
|
111
|
+
)
|
|
112
|
+
from keras_hub.src.tokenizers.word_piece_tokenizer import (
|
|
113
|
+
WordPieceTokenizer as WordPieceTokenizer,
|
|
114
|
+
)
|
|
115
|
+
from keras_hub.src.tokenizers.word_piece_tokenizer_trainer import (
|
|
116
|
+
compute_word_piece_vocabulary as compute_word_piece_vocabulary,
|
|
117
|
+
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""DO NOT EDIT.
|
|
2
|
+
|
|
3
|
+
This file was autogenerated. Do not edit it by hand,
|
|
4
|
+
since your modifications would be overwritten.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from keras_hub.src.utils.coco.coco_utils import (
|
|
8
|
+
coco_id_to_name as coco_id_to_name,
|
|
9
|
+
)
|
|
10
|
+
from keras_hub.src.utils.coco.coco_utils import (
|
|
11
|
+
coco_name_to_id as coco_name_to_id,
|
|
12
|
+
)
|
|
13
|
+
from keras_hub.src.utils.imagenet.imagenet_utils import (
|
|
14
|
+
decode_imagenet_predictions as decode_imagenet_predictions,
|
|
15
|
+
)
|
|
16
|
+
from keras_hub.src.utils.imagenet.imagenet_utils import (
|
|
17
|
+
imagenet_id_to_name as imagenet_id_to_name,
|
|
18
|
+
)
|
|
19
|
+
from keras_hub.src.utils.imagenet.imagenet_utils import (
|
|
20
|
+
imagenet_name_to_id as imagenet_name_to_id,
|
|
21
|
+
)
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: keras-hub
|
|
3
|
-
Version: 0.20.0.dev1
|
|
4
|
-
Summary:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
3
|
+
Version: 0.21.0.dev1
|
|
4
|
+
Summary: Pretrained models for Keras.
|
|
5
|
+
Author-email: Keras team <keras-users@googlegroups.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Home, https://keras.io/keras_hub/
|
|
8
|
+
Project-URL: Repository, https://github.com/keras-team/keras/keras_hub
|
|
9
9
|
Classifier: Development Status :: 3 - Alpha
|
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -28,20 +28,6 @@ Requires-Dist: regex
|
|
|
28
28
|
Requires-Dist: rich
|
|
29
29
|
Requires-Dist: kagglehub
|
|
30
30
|
Requires-Dist: tensorflow-text; platform_system != "Windows"
|
|
31
|
-
Provides-Extra: extras
|
|
32
|
-
Requires-Dist: rouge-score; extra == "extras"
|
|
33
|
-
Requires-Dist: sentencepiece; extra == "extras"
|
|
34
|
-
Dynamic: author
|
|
35
|
-
Dynamic: author-email
|
|
36
|
-
Dynamic: classifier
|
|
37
|
-
Dynamic: description
|
|
38
|
-
Dynamic: description-content-type
|
|
39
|
-
Dynamic: home-page
|
|
40
|
-
Dynamic: license
|
|
41
|
-
Dynamic: provides-extra
|
|
42
|
-
Dynamic: requires-dist
|
|
43
|
-
Dynamic: requires-python
|
|
44
|
-
Dynamic: summary
|
|
45
31
|
|
|
46
32
|
# KerasHub: Multi-framework Pretrained Models
|
|
47
33
|
[Tests](https://github.com/keras-team/keras-hub/actions?query=workflow%3ATests+branch%3Amaster)
|