keras-hub 0.20.0.dev1__py3-none-any.whl → 0.21.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. keras_hub/__init__.py +15 -33
  2. keras_hub/layers/__init__.py +134 -0
  3. keras_hub/metrics/__init__.py +11 -0
  4. keras_hub/models/__init__.py +642 -0
  5. keras_hub/samplers/__init__.py +18 -0
  6. keras_hub/src/layers/modeling/reversible_embedding.py +25 -35
  7. keras_hub/src/layers/preprocessing/image_converter.py +1 -0
  8. keras_hub/src/layers/preprocessing/random_deletion.py +1 -1
  9. keras_hub/src/layers/preprocessing/random_swap.py +1 -1
  10. keras_hub/src/models/audio_to_text.py +66 -0
  11. keras_hub/src/models/audio_to_text_preprocessor.py +80 -0
  12. keras_hub/src/models/backbone.py +5 -2
  13. keras_hub/src/models/cspnet/cspnet_backbone.py +51 -26
  14. keras_hub/src/models/cspnet/cspnet_presets.py +38 -3
  15. keras_hub/src/models/falcon/falcon_backbone.py +1 -1
  16. keras_hub/src/models/gemma/gemma_presets.py +10 -10
  17. keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +3 -2
  18. keras_hub/src/models/gemma3/gemma3_presets.py +8 -8
  19. keras_hub/src/models/gemma3/gemma3_vision_encoder.py +1 -1
  20. keras_hub/src/models/llama/llama_attention.py +24 -6
  21. keras_hub/src/models/llama/llama_backbone.py +50 -16
  22. keras_hub/src/models/llama/llama_decoder.py +20 -3
  23. keras_hub/src/models/llama/llama_presets.py +3 -3
  24. keras_hub/src/models/llama/llama_rotary_embedding.py +180 -0
  25. keras_hub/src/models/llama3/llama3_backbone.py +10 -2
  26. keras_hub/src/models/llama3/llama3_presets.py +84 -2
  27. keras_hub/src/models/mistral/mistral_presets.py +3 -3
  28. keras_hub/src/models/mixtral/__init__.py +5 -0
  29. keras_hub/src/models/mixtral/mixtral_attention.py +252 -0
  30. keras_hub/src/models/mixtral/mixtral_backbone.py +207 -0
  31. keras_hub/src/models/mixtral/mixtral_causal_lm.py +281 -0
  32. keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py +76 -0
  33. keras_hub/src/models/mixtral/mixtral_decoder.py +494 -0
  34. keras_hub/src/models/mixtral/mixtral_layer_norm.py +34 -0
  35. keras_hub/src/models/mixtral/mixtral_presets.py +26 -0
  36. keras_hub/src/models/mixtral/mixtral_tokenizer.py +21 -0
  37. keras_hub/src/models/moonshine/__init__.py +5 -0
  38. keras_hub/src/models/moonshine/moonshine_audio_converter.py +301 -0
  39. keras_hub/src/models/moonshine/moonshine_audio_to_text.py +383 -0
  40. keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py +272 -0
  41. keras_hub/src/models/moonshine/moonshine_backbone.py +478 -0
  42. keras_hub/src/models/moonshine/moonshine_decoder.py +313 -0
  43. keras_hub/src/models/moonshine/moonshine_encoder.py +212 -0
  44. keras_hub/src/models/moonshine/moonshine_layers.py +239 -0
  45. keras_hub/src/models/moonshine/moonshine_multi_head_attention.py +355 -0
  46. keras_hub/src/models/moonshine/moonshine_presets.py +25 -0
  47. keras_hub/src/models/moonshine/moonshine_tokenizer.py +62 -0
  48. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +11 -11
  49. keras_hub/src/models/pali_gemma/pali_gemma_vit.py +1 -1
  50. keras_hub/src/models/qwen/__init__.py +4 -0
  51. keras_hub/src/models/qwen/qwen_attention.py +3 -1
  52. keras_hub/src/models/qwen/qwen_backbone.py +8 -1
  53. keras_hub/src/models/qwen/qwen_causal_lm.py +7 -0
  54. keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py +7 -0
  55. keras_hub/src/models/qwen/qwen_presets.py +61 -0
  56. keras_hub/src/models/qwen/qwen_tokenizer.py +9 -0
  57. keras_hub/src/models/qwen_moe/__init__.py +5 -0
  58. keras_hub/src/models/qwen_moe/qwen_moe_attention.py +375 -0
  59. keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +373 -0
  60. keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py +350 -0
  61. keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py +17 -0
  62. keras_hub/src/models/qwen_moe/qwen_moe_decoder.py +625 -0
  63. keras_hub/src/models/qwen_moe/qwen_moe_layernorm.py +32 -0
  64. keras_hub/src/models/qwen_moe/qwen_moe_presets.py +15 -0
  65. keras_hub/src/models/qwen_moe/qwen_moe_tokenizer.py +46 -0
  66. keras_hub/src/models/retinanet/retinanet_image_converter.py +0 -13
  67. keras_hub/src/models/retinanet/retinanet_presets.py +2 -2
  68. keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +0 -18
  69. keras_hub/src/models/segformer/segformer_presets.py +12 -12
  70. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +6 -0
  71. keras_hub/src/models/task.py +5 -2
  72. keras_hub/src/models/xception/__init__.py +5 -0
  73. keras_hub/src/models/xception/xception_backbone.py +188 -0
  74. keras_hub/src/models/xception/xception_image_classifier.py +12 -0
  75. keras_hub/src/models/xception/xception_image_classifier_preprocessor.py +14 -0
  76. keras_hub/src/models/xception/xception_image_converter.py +8 -0
  77. keras_hub/src/models/xception/xception_presets.py +14 -0
  78. keras_hub/src/tests/mocks/mock_gemma3_tokenizer.py +155 -0
  79. keras_hub/src/utils/coco/__init__.py +0 -0
  80. keras_hub/src/utils/coco/coco_utils.py +133 -0
  81. keras_hub/src/utils/imagenet/imagenet_utils.py +36 -0
  82. keras_hub/src/utils/keras_utils.py +11 -0
  83. keras_hub/src/utils/preset_utils.py +70 -10
  84. keras_hub/src/utils/tensor_utils.py +27 -1
  85. keras_hub/src/utils/timm/convert_cspnet.py +94 -23
  86. keras_hub/src/utils/timm/preset_loader.py +6 -6
  87. keras_hub/src/utils/transformers/convert_llama3.py +21 -1
  88. keras_hub/src/utils/transformers/convert_mixtral.py +139 -0
  89. keras_hub/src/utils/transformers/convert_qwen.py +1 -0
  90. keras_hub/src/utils/transformers/convert_qwen_moe.py +253 -0
  91. keras_hub/src/utils/transformers/preset_loader.py +6 -0
  92. keras_hub/src/{version_utils.py → version.py} +1 -1
  93. keras_hub/tokenizers/__init__.py +117 -0
  94. keras_hub/utils/__init__.py +21 -0
  95. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/METADATA +6 -20
  96. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/RECORD +98 -55
  97. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/WHEEL +1 -1
  98. keras_hub/api/__init__.py +0 -15
  99. keras_hub/api/layers/__init__.py +0 -86
  100. keras_hub/api/metrics/__init__.py +0 -11
  101. keras_hub/api/models/__init__.py +0 -416
  102. keras_hub/api/samplers/__init__.py +0 -16
  103. keras_hub/api/tokenizers/__init__.py +0 -58
  104. keras_hub/api/utils/__init__.py +0 -9
  105. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,139 @@
1
+ import numpy as np
2
+
3
+ from keras_hub.src.models.mixtral.mixtral_backbone import MixtralBackbone
4
+ from keras_hub.src.utils.preset_utils import get_file
5
+
6
+ backbone_cls = MixtralBackbone
7
+
8
+
9
def convert_backbone_config(transformers_config):
    """Translate a Hugging Face Mixtral `config.json` dict into the
    keyword arguments expected by `MixtralBackbone`.

    Raises `KeyError` if any expected Hugging Face key is absent.
    """
    # KerasHub argument name -> Hugging Face config key.
    key_map = {
        "vocabulary_size": "vocab_size",
        "num_layers": "num_hidden_layers",
        "num_query_heads": "num_attention_heads",
        "hidden_dim": "hidden_size",
        "intermediate_dim": "intermediate_size",
        "num_key_value_heads": "num_key_value_heads",
        "num_experts": "num_local_experts",
        "top_k": "num_experts_per_tok",
        "rope_max_wavelength": "rope_theta",
        "layer_norm_epsilon": "rms_norm_eps",
        "sliding_window": "sliding_window",
        "output_router_logits": "output_router_logits",
    }
    return {
        keras_arg: transformers_config[hf_key]
        for keras_arg, hf_key in key_map.items()
    }
24
+
25
+
26
def convert_weights(backbone, loader, transformers_config):
    """Port Hugging Face Mixtral checkpoint weights into a KerasHub backbone.

    Args:
        backbone: A `MixtralBackbone` instance whose variables are assigned
            in place.
        loader: A safetensor loader exposing `port_weight` and `get_tensor`.
        transformers_config: The Hugging Face `config.json` dict. Not read
            here; kept for converter-interface consistency.

    Returns:
        The same `backbone`, with all weights assigned.
    """
    # Embeddings
    loader.port_weight(
        keras_variable=backbone.get_layer("token_embedding").embeddings,
        hf_weight_key="model.embed_tokens.weight",
    )
    # Transpose the LM head so its layout matches the reverse-embedding
    # variable (the hook swaps the two axes of the HF tensor).
    loader.port_weight(
        keras_variable=backbone.get_layer("token_embedding").reverse_embeddings,
        hf_weight_key="lm_head.weight",
        hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
    )

    # Transpose an HF 2-D kernel, then reshape it to the (possibly per-head)
    # shape of the target Keras variable.
    def transpose_and_reshape(x, shape):
        return np.reshape(np.transpose(x), shape)

    for i in range(backbone.num_layers):
        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")

        # Input layernorm
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layernorm.scale,
            hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
        )

        # Attention layers
        ## Query
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.query_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
            hook_fn=transpose_and_reshape,
        )
        ## Key
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.key_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
            hook_fn=transpose_and_reshape,
        )
        ## Value
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.value_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
            hook_fn=transpose_and_reshape,
        )
        ## Output
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.output_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
            hook_fn=transpose_and_reshape,
        )

        # MoE layers
        # Router gate
        loader.port_weight(
            keras_variable=decoder_layer._sparse_moe_block._sparse_feedforward_gate_dense.kernel,
            hf_weight_key=f"model.layers.{i}.block_sparse_moe.gate.weight",
            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
        )

        # Batched experts: w1 (gate), w3 (intermediate), and w2 (output) weights
        # HF stores one set of w1/w2/w3 tensors per expert; KerasHub keeps a
        # single batched tensor per projection, so collect then stack below.
        gate_weights_list = []
        intermediate_weights_list = []
        output_weights_list = []
        for expert_idx in range(backbone.num_experts):
            # Load w1 (gate dense) for each expert
            w1 = loader.get_tensor(
                f"model.layers.{i}.block_sparse_moe.experts.{expert_idx}.w1.weight"
            )
            w1_transposed = np.transpose(w1, axes=(1, 0))
            gate_weights_list.append(w1_transposed)

            w3 = loader.get_tensor(
                f"model.layers.{i}.block_sparse_moe.experts.{expert_idx}.w3.weight"
            )
            w3_transposed = np.transpose(w3, axes=(1, 0))
            intermediate_weights_list.append(w3_transposed)

            w2 = loader.get_tensor(
                f"model.layers.{i}.block_sparse_moe.experts.{expert_idx}.w2.weight"
            )
            w2_transposed = np.transpose(w2, axes=(1, 0))
            output_weights_list.append(w2_transposed)

        # Stack along a new leading axis: one slice per expert.
        gate_batched = np.stack(gate_weights_list, axis=0)
        intermediate_batched = np.stack(intermediate_weights_list, axis=0)
        output_batched = np.stack(output_weights_list, axis=0)

        # Assign batched weights to expert_bank
        decoder_layer._sparse_moe_block.expert_bank._expert_feedforward_gate_dense.assign(
            gate_batched
        )
        decoder_layer._sparse_moe_block.expert_bank._expert_feedforward_intermediate_dense.assign(
            intermediate_batched
        )
        decoder_layer._sparse_moe_block.expert_bank._expert_feedforward_output_dense.assign(
            output_batched
        )

        # Feedforward layernorm
        loader.port_weight(
            keras_variable=decoder_layer._feedforward_layernorm.scale,
            hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
        )

    # Final normalization layer
    loader.port_weight(
        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
        hf_weight_key="model.norm.weight",
    )

    return backbone
136
+
137
+
138
def convert_tokenizer(cls, preset, **kwargs):
    """Instantiate a Mixtral tokenizer of type `cls` from a preset.

    Downloads/locates the preset's SentencePiece model file and forwards it,
    along with any extra keyword arguments, to the tokenizer constructor.
    """
    proto_file = get_file(preset, "tokenizer.model")
    return cls(proto_file, **kwargs)
@@ -18,6 +18,7 @@ def convert_backbone_config(transformers_config):
18
18
  "rope_max_wavelength": transformers_config["rope_theta"],
19
19
  "use_sliding_window": transformers_config["use_sliding_window"],
20
20
  "sliding_window_size": transformers_config["sliding_window"],
21
+ "tie_word_embeddings": transformers_config["tie_word_embeddings"],
21
22
  }
22
23
 
23
24
 
@@ -0,0 +1,253 @@
1
+ import numpy as np
2
+
3
+ from keras_hub.src.models.qwen_moe.qwen_moe_backbone import QwenMoeBackbone
4
+ from keras_hub.src.utils.preset_utils import load_json
5
+
6
+ backbone_cls = QwenMoeBackbone
7
+
8
+
9
def convert_backbone_config(transformers_config):
    """Translate a Hugging Face Qwen2-MoE `config.json` dict into the
    keyword arguments expected by `QwenMoeBackbone`.

    Raises `KeyError` if any expected Hugging Face key is absent.
    """
    # KerasHub argument name -> Hugging Face config key.
    key_map = {
        "vocabulary_size": "vocab_size",
        "hidden_dim": "hidden_size",
        "num_layers": "num_hidden_layers",
        "num_query_heads": "num_attention_heads",
        "num_key_value_heads": "num_key_value_heads",
        "intermediate_dim": "intermediate_size",
        "moe_intermediate_dim": "moe_intermediate_size",
        "shared_expert_intermediate_dim": "shared_expert_intermediate_size",
        "num_experts": "num_experts",
        "top_k": "num_experts_per_tok",
        "norm_top_k_prob": "norm_topk_prob",
        "decoder_sparse_step": "decoder_sparse_step",
        "layer_norm_epsilon": "rms_norm_eps",
        "rope_max_wavelength": "rope_theta",
        "use_sliding_window": "use_sliding_window",
        "sliding_window_size": "sliding_window",
        "output_router_logits": "output_router_logits",
        "router_aux_loss_coefficient": "router_aux_loss_coef",
    }
    return {
        keras_arg: transformers_config[hf_key]
        for keras_arg, hf_key in key_map.items()
    }
34
+
35
+
36
def convert_weights(backbone, loader, transformers_config):
    """Port Hugging Face Qwen2-MoE checkpoint weights into a KerasHub backbone.

    Args:
        backbone: A `QwenMoeBackbone` instance whose variables are assigned
            in place.
        loader: A safetensor loader exposing `port_weight` and `get_tensor`.
        transformers_config: The Hugging Face `config.json` dict. Not read
            here; kept for converter-interface consistency.

    Returns:
        The same `backbone`, with all weights assigned.
    """
    loader.port_weight(
        keras_variable=backbone.get_layer("token_embedding").embeddings,
        hf_weight_key="model.embed_tokens.weight",
    )
    # Only port a separate LM head when the backbone does not tie input and
    # output embeddings; with tied embeddings there is no lm_head tensor to
    # load.
    if not backbone.tie_word_embeddings:
        loader.port_weight(
            keras_variable=backbone.get_layer(
                "token_embedding"
            ).reverse_embeddings,
            hf_weight_key="lm_head.weight",
            # rearrange_pattern="b a -> a b",
            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
        )

    # Transpose an HF 2-D kernel, then reshape it to the (possibly per-head)
    # shape of the target Keras variable. For 1-D bias tensors the transpose
    # is a no-op and only the reshape applies.
    def transpose_and_reshape(x, shape):
        return np.reshape(np.transpose(x), shape)

    for i in range(backbone.num_layers):
        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")

        # Input layernorm
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layernorm.scale,
            hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
        )

        # Attention layers

        ## Query
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.query_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
            hook_fn=transpose_and_reshape,
        )
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.query_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.bias",
            hook_fn=transpose_and_reshape,
        )
        ## Key
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.key_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
            hook_fn=transpose_and_reshape,
        )
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.key_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.bias",
            hook_fn=transpose_and_reshape,
        )
        ## Value
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.value_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
            hook_fn=transpose_and_reshape,
        )
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer.value_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.bias",
            hook_fn=transpose_and_reshape,
        )
        ## Output
        loader.port_weight(
            keras_variable=decoder_layer._self_attention_layer._output_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
            # rearrange_patterns="c (a b) -> a b c",
            # rearrange_dims={"a": backbone.num_query_heads},
            hook_fn=transpose_and_reshape,
        )

        # MLP layers
        # A layer is a sparse MoE layer unless it is listed in
        # mlp_only_layers or skipped by decoder_sparse_step; this mirrors
        # the layer-construction rule of the backbone — assumed, confirm
        # against QwenMoeBackbone.
        if (
            (i not in backbone.mlp_only_layers)
            and backbone.num_experts > 0
            and ((i + 1) % backbone.decoder_sparse_step == 0)
        ):
            # MoE layers
            loader.port_weight(
                keras_variable=decoder_layer.mlp._sparse_feedforward_gate_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.gate.weight",
                # rearrange_patterns="b a -> a b",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )
            # Batched experts: gate_up_proj and down_proj
            # HF stores gate_proj/up_proj/down_proj per expert; KerasHub
            # keeps fused, batched tensors, so collect and stack below.
            gate_up_proj_list = []
            down_proj_list = []
            for expert_idx in range(backbone.num_experts):
                # Load gate_proj and up_proj for each expert
                gate_proj = loader.get_tensor(
                    f"model.layers.{i}.mlp.experts.{expert_idx}.gate_proj.weight"
                )
                up_proj = loader.get_tensor(
                    f"model.layers.{i}.mlp.experts.{expert_idx}.up_proj.weight"
                )
                # Transpose to (hidden_dim, intermediate_dim)
                gate_proj = np.transpose(gate_proj, axes=(1, 0))
                up_proj = np.transpose(up_proj, axes=(1, 0))
                # Concatenate gate_proj and up_proj along the last dimension
                gate_up_proj = np.concatenate([gate_proj, up_proj], axis=-1)
                gate_up_proj_list.append(gate_up_proj)

                # Load down_proj for each expert
                down_proj = loader.get_tensor(
                    f"model.layers.{i}.mlp.experts.{expert_idx}.down_proj.weight"
                )
                down_proj = np.transpose(
                    down_proj, axes=(1, 0)
                )  # (intermediate_dim, hidden_dim)
                down_proj_list.append(down_proj)

            # Stack the lists to create batched weights
            gate_up_proj_batched = np.stack(
                gate_up_proj_list, axis=0
            )  # (num_experts, hidden_dim, 2 * intermediate_dim)
            down_proj_batched = np.stack(
                down_proj_list, axis=0
            )  # (num_experts, intermediate_dim, hidden_dim)

            # Assign batched weights to expert_bank
            decoder_layer.mlp.expert_bank._expert_feedforward_gate_dense.assign(
                gate_up_proj_batched
            )
            decoder_layer.mlp.expert_bank._expert_feedforward_output_dense.assign(
                down_proj_batched
            )

            # Shared expert (always-on dense path alongside routed experts).
            loader.port_weight(
                keras_variable=decoder_layer.mlp.shared_expert_dense._feedforward_intermediate_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.shared_expert.up_proj.weight",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )
            loader.port_weight(
                keras_variable=decoder_layer.mlp.shared_expert_dense._feedforward_output_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.shared_expert.down_proj.weight",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )
            loader.port_weight(
                keras_variable=decoder_layer.mlp.shared_expert_dense._feedforward_gate_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.shared_expert.gate_proj.weight",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )

            loader.port_weight(
                keras_variable=decoder_layer.mlp.shared_expert_gate_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.shared_expert_gate.weight",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )
        else:
            # Plain (non-MoE) MLP layer.
            loader.port_weight(
                keras_variable=decoder_layer._feedforward_intermediate_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.up_proj.weight",
                # rearrange_patterns="b a -> a b",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )
            loader.port_weight(
                keras_variable=decoder_layer._feedforward_output_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.down_proj.weight",
                # rearrange_patterns="b a -> a b",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )
            loader.port_weight(
                keras_variable=decoder_layer._feedforward_gate_dense.kernel,
                hf_weight_key=f"model.layers.{i}.mlp.gate_proj.weight",
                # rearrange_patterns="b a -> a b",
                hook_fn=lambda hf_tensor, _: np.transpose(
                    hf_tensor, axes=(1, 0)
                ),
            )

        # Feedforward layernorm
        loader.port_weight(
            keras_variable=decoder_layer._feedforward_layernorm.scale,
            hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
        )

    # Final normalization layer
    loader.port_weight(
        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
        hf_weight_key="model.norm.weight",
    )

    return backbone
233
+
234
+
235
def convert_tokenizer(cls, preset, **kwargs):
    """Instantiate a Qwen-MoE tokenizer of type `cls` from a preset.

    Reads the preset's `tokenizer.json`, extracts the BPE vocabulary and
    merge list, and registers every added special token — except
    `<|reserved_special_token_...|>` placeholders — both in the vocabulary
    and as unsplittable tokens.

    Args:
        cls: The KerasHub tokenizer class to instantiate.
        preset: Preset identifier understood by `load_json`.
        **kwargs: Extra keyword arguments forwarded to `cls`.

    Returns:
        An instance of `cls` configured with the preset's vocabulary.
    """
    tokenizer_config = load_json(preset, "tokenizer.json")
    vocab = tokenizer_config["model"]["vocab"]
    merges = tokenizer_config["model"]["merges"]

    # Load all special tokens with the exception of "reserved" ones.
    special_tokens = set()
    for token in tokenizer_config["added_tokens"]:
        if not token["content"].startswith("<|reserved_special_token_"):
            vocab[token["content"]] = token["id"]
            special_tokens.add(token["content"])

    # Sort for determinism: iterating a set directly depends on
    # PYTHONHASHSEED, which would make the serialized tokenizer config
    # differ between otherwise-identical runs.
    kwargs.update(
        {
            "unsplittable_tokens": sorted(special_tokens),
        }
    )

    return cls(vocabulary=vocab, merges=merges, **kwargs)
@@ -11,8 +11,10 @@ from keras_hub.src.utils.transformers import convert_gemma
11
11
  from keras_hub.src.utils.transformers import convert_gpt2
12
12
  from keras_hub.src.utils.transformers import convert_llama3
13
13
  from keras_hub.src.utils.transformers import convert_mistral
14
+ from keras_hub.src.utils.transformers import convert_mixtral
14
15
  from keras_hub.src.utils.transformers import convert_pali_gemma
15
16
  from keras_hub.src.utils.transformers import convert_qwen
17
+ from keras_hub.src.utils.transformers import convert_qwen_moe
16
18
  from keras_hub.src.utils.transformers import convert_vit
17
19
  from keras_hub.src.utils.transformers.safetensor_utils import SafetensorLoader
18
20
 
@@ -44,6 +46,10 @@ class TransformersPresetLoader(PresetLoader):
44
46
  self.converter = convert_vit
45
47
  elif model_type == "qwen2":
46
48
  self.converter = convert_qwen
49
+ elif model_type == "mixtral":
50
+ self.converter = convert_mixtral
51
+ elif model_type == "qwen2_moe":
52
+ self.converter = convert_qwen_moe
47
53
  else:
48
54
  raise ValueError(
49
55
  "KerasHub has no converter for huggingface/transformers models "
@@ -1,7 +1,7 @@
1
1
  from keras_hub.src.api_export import keras_hub_export
2
2
 
3
3
  # Unique source of truth for the version number.
4
- __version__ = "0.20.0.dev1"
4
+ __version__ = "0.21.0.dev1"
5
5
 
6
6
 
7
7
  @keras_hub_export("keras_hub.version")
@@ -0,0 +1,117 @@
1
+ """DO NOT EDIT.
2
+
3
+ This file was autogenerated. Do not edit it by hand,
4
+ since your modifications would be overwritten.
5
+ """
6
+
7
+ from keras_hub.src.models.albert.albert_tokenizer import (
8
+ AlbertTokenizer as AlbertTokenizer,
9
+ )
10
+ from keras_hub.src.models.bart.bart_tokenizer import (
11
+ BartTokenizer as BartTokenizer,
12
+ )
13
+ from keras_hub.src.models.bert.bert_tokenizer import (
14
+ BertTokenizer as BertTokenizer,
15
+ )
16
+ from keras_hub.src.models.bloom.bloom_tokenizer import (
17
+ BloomTokenizer as BloomTokenizer,
18
+ )
19
+ from keras_hub.src.models.clip.clip_tokenizer import (
20
+ CLIPTokenizer as CLIPTokenizer,
21
+ )
22
+ from keras_hub.src.models.deberta_v3.deberta_v3_tokenizer import (
23
+ DebertaV3Tokenizer as DebertaV3Tokenizer,
24
+ )
25
+ from keras_hub.src.models.distil_bert.distil_bert_tokenizer import (
26
+ DistilBertTokenizer as DistilBertTokenizer,
27
+ )
28
+ from keras_hub.src.models.electra.electra_tokenizer import (
29
+ ElectraTokenizer as ElectraTokenizer,
30
+ )
31
+ from keras_hub.src.models.f_net.f_net_tokenizer import (
32
+ FNetTokenizer as FNetTokenizer,
33
+ )
34
+ from keras_hub.src.models.falcon.falcon_tokenizer import (
35
+ FalconTokenizer as FalconTokenizer,
36
+ )
37
+ from keras_hub.src.models.gemma.gemma_tokenizer import (
38
+ GemmaTokenizer as GemmaTokenizer,
39
+ )
40
+ from keras_hub.src.models.gemma3.gemma3_tokenizer import (
41
+ Gemma3Tokenizer as Gemma3Tokenizer,
42
+ )
43
+ from keras_hub.src.models.gpt2.gpt2_tokenizer import (
44
+ GPT2Tokenizer as GPT2Tokenizer,
45
+ )
46
+ from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import (
47
+ GPTNeoXTokenizer as GPTNeoXTokenizer,
48
+ )
49
+ from keras_hub.src.models.llama.llama_tokenizer import (
50
+ LlamaTokenizer as LlamaTokenizer,
51
+ )
52
+ from keras_hub.src.models.llama3.llama3_tokenizer import (
53
+ Llama3Tokenizer as Llama3Tokenizer,
54
+ )
55
+ from keras_hub.src.models.mistral.mistral_tokenizer import (
56
+ MistralTokenizer as MistralTokenizer,
57
+ )
58
+ from keras_hub.src.models.mixtral.mixtral_tokenizer import (
59
+ MixtralTokenizer as MixtralTokenizer,
60
+ )
61
+ from keras_hub.src.models.moonshine.moonshine_tokenizer import (
62
+ MoonshineTokenizer as MoonshineTokenizer,
63
+ )
64
+ from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer as OPTTokenizer
65
+ from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import (
66
+ PaliGemmaTokenizer as PaliGemmaTokenizer,
67
+ )
68
+ from keras_hub.src.models.phi3.phi3_tokenizer import (
69
+ Phi3Tokenizer as Phi3Tokenizer,
70
+ )
71
+ from keras_hub.src.models.qwen.qwen_tokenizer import (
72
+ QwenTokenizer as Qwen2Tokenizer,
73
+ )
74
+ from keras_hub.src.models.qwen.qwen_tokenizer import (
75
+ QwenTokenizer as QwenTokenizer,
76
+ )
77
+ from keras_hub.src.models.qwen_moe.qwen_moe_tokenizer import (
78
+ QwenMoeTokenizer as QwenMoeTokenizer,
79
+ )
80
+ from keras_hub.src.models.roberta.roberta_tokenizer import (
81
+ RobertaTokenizer as RobertaTokenizer,
82
+ )
83
+ from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import (
84
+ RoformerV2Tokenizer as RoformerV2Tokenizer,
85
+ )
86
+ from keras_hub.src.models.siglip.siglip_tokenizer import (
87
+ SigLIPTokenizer as SigLIPTokenizer,
88
+ )
89
+ from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer as T5Tokenizer
90
+ from keras_hub.src.models.whisper.whisper_tokenizer import (
91
+ WhisperTokenizer as WhisperTokenizer,
92
+ )
93
+ from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
94
+ XLMRobertaTokenizer as XLMRobertaTokenizer,
95
+ )
96
+ from keras_hub.src.tokenizers.byte_pair_tokenizer import (
97
+ BytePairTokenizer as BytePairTokenizer,
98
+ )
99
+ from keras_hub.src.tokenizers.byte_tokenizer import (
100
+ ByteTokenizer as ByteTokenizer,
101
+ )
102
+ from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
103
+ SentencePieceTokenizer as SentencePieceTokenizer,
104
+ )
105
+ from keras_hub.src.tokenizers.sentence_piece_tokenizer_trainer import (
106
+ compute_sentence_piece_proto as compute_sentence_piece_proto,
107
+ )
108
+ from keras_hub.src.tokenizers.tokenizer import Tokenizer as Tokenizer
109
+ from keras_hub.src.tokenizers.unicode_codepoint_tokenizer import (
110
+ UnicodeCodepointTokenizer as UnicodeCodepointTokenizer,
111
+ )
112
+ from keras_hub.src.tokenizers.word_piece_tokenizer import (
113
+ WordPieceTokenizer as WordPieceTokenizer,
114
+ )
115
+ from keras_hub.src.tokenizers.word_piece_tokenizer_trainer import (
116
+ compute_word_piece_vocabulary as compute_word_piece_vocabulary,
117
+ )
@@ -0,0 +1,21 @@
1
+ """DO NOT EDIT.
2
+
3
+ This file was autogenerated. Do not edit it by hand,
4
+ since your modifications would be overwritten.
5
+ """
6
+
7
+ from keras_hub.src.utils.coco.coco_utils import (
8
+ coco_id_to_name as coco_id_to_name,
9
+ )
10
+ from keras_hub.src.utils.coco.coco_utils import (
11
+ coco_name_to_id as coco_name_to_id,
12
+ )
13
+ from keras_hub.src.utils.imagenet.imagenet_utils import (
14
+ decode_imagenet_predictions as decode_imagenet_predictions,
15
+ )
16
+ from keras_hub.src.utils.imagenet.imagenet_utils import (
17
+ imagenet_id_to_name as imagenet_id_to_name,
18
+ )
19
+ from keras_hub.src.utils.imagenet.imagenet_utils import (
20
+ imagenet_name_to_id as imagenet_name_to_id,
21
+ )
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: keras-hub
3
- Version: 0.20.0.dev1
4
- Summary: Industry-strength Natural Language Processing extensions for Keras.
5
- Home-page: https://github.com/keras-team/keras-hub
6
- Author: Keras team
7
- Author-email: keras-hub@google.com
8
- License: Apache License 2.0
3
+ Version: 0.21.0.dev1
4
+ Summary: Pretrained models for Keras.
5
+ Author-email: Keras team <keras-users@googlegroups.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Home, https://keras.io/keras_hub/
8
+ Project-URL: Repository, https://github.com/keras-team/keras/keras_hub
9
9
  Classifier: Development Status :: 3 - Alpha
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.9
@@ -28,20 +28,6 @@ Requires-Dist: regex
28
28
  Requires-Dist: rich
29
29
  Requires-Dist: kagglehub
30
30
  Requires-Dist: tensorflow-text; platform_system != "Windows"
31
- Provides-Extra: extras
32
- Requires-Dist: rouge-score; extra == "extras"
33
- Requires-Dist: sentencepiece; extra == "extras"
34
- Dynamic: author
35
- Dynamic: author-email
36
- Dynamic: classifier
37
- Dynamic: description
38
- Dynamic: description-content-type
39
- Dynamic: home-page
40
- Dynamic: license
41
- Dynamic: provides-extra
42
- Dynamic: requires-dist
43
- Dynamic: requires-python
44
- Dynamic: summary
45
31
 
46
32
  # KerasHub: Multi-framework Pretrained Models
47
33
  [![](https://github.com/keras-team/keras-hub/workflows/Tests/badge.svg?branch=master)](https://github.com/keras-team/keras-hub/actions?query=workflow%3ATests+branch%3Amaster)