keras-hub 0.21.1__py3-none-any.whl → 0.22.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. keras_hub/layers/__init__.py +9 -0
  2. keras_hub/models/__init__.py +47 -0
  3. keras_hub/src/layers/modeling/transformer_encoder.py +6 -3
  4. keras_hub/src/layers/preprocessing/multi_segment_packer.py +17 -3
  5. keras_hub/src/layers/preprocessing/start_end_packer.py +24 -6
  6. keras_hub/src/models/backbone.py +13 -10
  7. keras_hub/src/models/clip/clip_backbone.py +3 -102
  8. keras_hub/src/models/clip/clip_layers.py +295 -0
  9. keras_hub/src/models/clip/clip_preprocessor.py +57 -48
  10. keras_hub/src/models/clip/clip_text_encoder.py +2 -2
  11. keras_hub/src/models/clip/clip_vision_encoder.py +3 -3
  12. keras_hub/src/models/deit/__init__.py +5 -0
  13. keras_hub/src/models/deit/deit_backbone.py +154 -0
  14. keras_hub/src/models/deit/deit_image_classifier.py +171 -0
  15. keras_hub/src/models/deit/deit_image_classifier_preprocessor.py +12 -0
  16. keras_hub/src/models/deit/deit_image_converter.py +8 -0
  17. keras_hub/src/models/deit/deit_layers.py +519 -0
  18. keras_hub/src/models/deit/deit_presets.py +49 -0
  19. keras_hub/src/models/dinov2/__init__.py +5 -0
  20. keras_hub/src/models/dinov2/dinov2_backbone.py +228 -0
  21. keras_hub/src/models/dinov2/dinov2_image_converter.py +8 -0
  22. keras_hub/src/models/dinov2/dinov2_layers.py +886 -0
  23. keras_hub/src/models/dinov2/dinov2_presets.py +89 -0
  24. keras_hub/src/models/esm/__init__.py +5 -0
  25. keras_hub/src/models/esm/esm_attention.py +95 -0
  26. keras_hub/src/models/esm/esm_backbone.py +229 -0
  27. keras_hub/src/models/esm/esm_classifier.py +184 -0
  28. keras_hub/src/models/esm/esm_classifier_preprocessor.py +135 -0
  29. keras_hub/src/models/esm/esm_encoder.py +134 -0
  30. keras_hub/src/models/esm/esm_masked_plm.py +117 -0
  31. keras_hub/src/models/esm/esm_masked_plm_preprocessor.py +143 -0
  32. keras_hub/src/models/esm/esm_presets.py +53 -0
  33. keras_hub/src/models/esm/esm_tokenizer.py +82 -0
  34. keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +6 -2
  35. keras_hub/src/models/gemma/gemma_attention.py +1 -1
  36. keras_hub/src/models/gemma3/gemma3_backbone.py +2 -2
  37. keras_hub/src/models/gemma3/gemma3_interleave_embeddings.py +1 -1
  38. keras_hub/src/models/hgnetv2/__init__.py +5 -0
  39. keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +193 -0
  40. keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +148 -0
  41. keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py +216 -0
  42. keras_hub/src/models/hgnetv2/hgnetv2_image_classifier_preprocessor.py +14 -0
  43. keras_hub/src/models/hgnetv2/hgnetv2_image_converter.py +8 -0
  44. keras_hub/src/models/hgnetv2/hgnetv2_layers.py +918 -0
  45. keras_hub/src/models/hgnetv2/hgnetv2_presets.py +58 -0
  46. keras_hub/src/models/llama3/llama3_presets.py +3 -3
  47. keras_hub/src/models/mistral/mistral_presets.py +17 -1
  48. keras_hub/src/models/mixtral/mixtral_presets.py +2 -2
  49. keras_hub/src/models/mobilenet/mobilenet_presets.py +4 -4
  50. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +2 -2
  51. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +2 -2
  52. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +17 -17
  53. keras_hub/src/models/qwen3/__init__.py +5 -0
  54. keras_hub/src/models/qwen3/qwen3_attention.py +369 -0
  55. keras_hub/src/models/qwen3/qwen3_backbone.py +191 -0
  56. keras_hub/src/models/qwen3/qwen3_causal_lm.py +390 -0
  57. keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py +10 -0
  58. keras_hub/src/models/qwen3/qwen3_decoder.py +309 -0
  59. keras_hub/src/models/qwen3/qwen3_layernorm.py +38 -0
  60. keras_hub/src/models/qwen3/qwen3_presets.py +73 -0
  61. keras_hub/src/models/qwen3/qwen3_tokenizer.py +48 -0
  62. keras_hub/src/models/qwen_moe/qwen_moe_attention.py +1 -0
  63. keras_hub/src/models/qwen_moe/qwen_moe_presets.py +2 -2
  64. keras_hub/src/models/roformer_v2/roformer_v2_attention.py +0 -2
  65. keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +16 -7
  66. keras_hub/src/models/stable_diffusion_3/mmdit.py +61 -4
  67. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +31 -32
  68. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +1 -0
  69. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +1 -0
  70. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -0
  71. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +6 -2
  72. keras_hub/src/models/vit/vit_backbone.py +31 -11
  73. keras_hub/src/models/vit/vit_image_converter.py +0 -70
  74. keras_hub/src/models/vit/vit_layers.py +33 -18
  75. keras_hub/src/models/vit/vit_presets.py +11 -11
  76. keras_hub/src/utils/keras_utils.py +17 -0
  77. keras_hub/src/utils/preset_utils.py +19 -4
  78. keras_hub/src/utils/tensor_utils.py +14 -0
  79. keras_hub/src/utils/transformers/convert_deit.py +155 -0
  80. keras_hub/src/utils/transformers/convert_dinov2.py +180 -0
  81. keras_hub/src/utils/transformers/convert_esm.py +159 -0
  82. keras_hub/src/utils/transformers/convert_llama3.py +6 -0
  83. keras_hub/src/utils/transformers/convert_qwen3.py +145 -0
  84. keras_hub/src/utils/transformers/export/gemma.py +89 -0
  85. keras_hub/src/utils/transformers/export/hf_exporter.py +98 -0
  86. keras_hub/src/utils/transformers/preset_loader.py +14 -2
  87. keras_hub/src/version.py +1 -1
  88. keras_hub/tokenizers/__init__.py +1 -0
  89. {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/METADATA +4 -4
  90. {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/RECORD +92 -48
  91. keras_hub/src/models/clip/clip_encoder_block.py +0 -111
  92. keras_hub/src/models/clip/clip_vision_embedding.py +0 -101
  93. {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/WHEEL +0 -0
  94. {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/top_level.txt +0 -0
keras_hub/src/utils/transformers/convert_deit.py (new file)
@@ -0,0 +1,155 @@
+import numpy as np
+
+from keras_hub.src.models.deit.deit_backbone import DeiTBackbone
+
+backbone_cls = DeiTBackbone
+
+
+def convert_backbone_config(transformers_config):
+    image_size = transformers_config["image_size"]
+    return {
+        "image_shape": (image_size, image_size, 3),
+        "patch_size": transformers_config["patch_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_heads": transformers_config["num_attention_heads"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "dropout_rate": transformers_config["hidden_dropout_prob"],
+        "attention_dropout": transformers_config[
+            "attention_probs_dropout_prob"
+        ],
+        "layer_norm_epsilon": transformers_config["layer_norm_eps"],
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    def port_ln(keras_variable, weight_key):
+        loader.port_weight(keras_variable.gamma, f"{weight_key}.weight")
+        loader.port_weight(keras_variable.beta, f"{weight_key}.bias")
+
+    def port_dense(keras_variable, weight_key):
+        loader.port_weight(
+            keras_variable.kernel,
+            f"{weight_key}.weight",
+            hook_fn=lambda x, _: x.T,
+        )
+        if keras_variable.bias is not None:
+            loader.port_weight(keras_variable.bias, f"{weight_key}.bias")
+
+    def port_mha(keras_variable, weight_key, num_heads, hidden_dim):
+        # query
+        loader.port_weight(
+            keras_variable.query_dense.kernel,
+            f"{weight_key}.attention.query.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.query_dense.bias,
+            f"{weight_key}.attention.query.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # key
+        loader.port_weight(
+            keras_variable.key_dense.kernel,
+            f"{weight_key}.attention.key.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.key_dense.bias,
+            f"{weight_key}.attention.key.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # value
+        loader.port_weight(
+            keras_variable.value_dense.kernel,
+            f"{weight_key}.attention.value.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.value_dense.bias,
+            f"{weight_key}.attention.value.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # output
+        loader.port_weight(
+            keras_variable.output_dense.kernel,
+            f"{weight_key}.output.dense.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (num_heads, hidden_dim // num_heads, hidden_dim)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.output_dense.bias, f"{weight_key}.output.dense.bias"
+        )
+
+    loader.port_weight(
+        keras_variable=backbone.layers[1].patch_embedding.kernel,
+        hf_weight_key="deit.embeddings.patch_embeddings.projection.weight",
+        hook_fn=lambda x, _: np.transpose(x, (2, 3, 1, 0)),
+    )
+
+    loader.port_weight(
+        backbone.layers[1].patch_embedding.bias,
+        "deit.embeddings.patch_embeddings.projection.bias",
+    )
+
+    loader.port_weight(
+        backbone.layers[1].class_token,
+        "deit.embeddings.cls_token",
+    )
+
+    loader.port_weight(
+        backbone.layers[1].distillation_token,
+        "deit.embeddings.distillation_token",
+    )
+
+    loader.port_weight(
+        backbone.layers[1].position_embedding,
+        "deit.embeddings.position_embeddings",
+    )
+
+    encoder_layers = backbone.layers[2].encoder_layers
+    for i, encoder_block in enumerate(encoder_layers):
+        prefix = "deit.encoder.layer"
+        num_heads = encoder_block.num_heads
+        hidden_dim = encoder_block.hidden_dim
+
+        port_mha(
+            encoder_block.mha,
+            f"{prefix}.{i}.attention",
+            num_heads,
+            hidden_dim,
+        )
+        port_ln(encoder_block.layer_norm_1, f"{prefix}.{i}.layernorm_before")
+        port_ln(encoder_block.layer_norm_2, f"{prefix}.{i}.layernorm_after")
+
+        port_dense(encoder_block.mlp.dense, f"{prefix}.{i}.intermediate.dense")
+        port_dense(
+            encoder_block.output_layer.dense, f"{prefix}.{i}.output.dense"
+        )
+    port_ln(backbone.layers[2].layer_norm, "deit.layernorm")
+
+
+def convert_head(task, loader, transformers_config):
+    prefix = "cls_classifier."
+    loader.port_weight(
+        task.output_dense.kernel,
+        hf_weight_key=prefix + "weight",
+        hook_fn=lambda x, _: x.T,
+    )
+    loader.port_weight(
+        task.output_dense.bias,
+        hf_weight_key=prefix + "bias",
+    )
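For orientation, a minimal sketch of how a converter module like the one above is exercised; it is not part of the diff. The Hugging Face handle and export name are assumptions for illustration, and loading an "hf://" preset goes through the transformers preset loader that dispatches to these convert_* functions.

import numpy as np
import keras_hub

# Hypothetical usage: the checkpoint handle is assumed, not taken from this diff.
classifier = keras_hub.models.DeiTImageClassifier.from_preset(
    "hf://facebook/deit-base-distilled-patch16-224"
)
# A random image batch just to exercise the converted graph end to end.
images = np.random.uniform(0, 255, size=(1, 224, 224, 3)).astype("float32")
logits = classifier.predict(images)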
keras_hub/src/utils/transformers/convert_dinov2.py (new file)
@@ -0,0 +1,180 @@
+import numpy as np
+
+from keras_hub.src.models.dinov2.dinov2_backbone import DINOV2Backbone
+
+backbone_cls = DINOV2Backbone
+
+
+def convert_backbone_config(transformers_config):
+    model_type = transformers_config["model_type"]
+    antialias_in_interpolation = False if model_type == "dinov2" else True
+    image_size = transformers_config["image_size"]
+    intermediate_dim = int(
+        transformers_config["hidden_size"] * transformers_config["mlp_ratio"]
+    )
+    return {
+        "patch_size": transformers_config["patch_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "num_heads": transformers_config["num_attention_heads"],
+        "intermediate_dim": intermediate_dim,
+        "layer_scale_init_value": transformers_config["layerscale_value"],
+        "num_register_tokens": transformers_config.get(
+            "num_register_tokens", 0
+        ),
+        "use_mask_token": transformers_config.get("use_mask_token", True),
+        "use_swiglu_ffn": transformers_config["use_swiglu_ffn"],
+        "dropout_rate": transformers_config["hidden_dropout_prob"],
+        "drop_path_rate": transformers_config["drop_path_rate"],
+        "image_shape": (image_size, image_size, 3),
+        "position_embedding_shape": (image_size, image_size),
+        "antialias_in_interpolation": antialias_in_interpolation,
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    if not isinstance(backbone, DINOV2Backbone):
+        raise ValueError(
+            "The provided backbone must be an instance of DINOV2Backbone. "
+            f"Received: {type(backbone)}"
+        )
+
+    def port_ln(keras_variable, weight_key):
+        loader.port_weight(keras_variable.gamma, f"{weight_key}.weight")
+        loader.port_weight(keras_variable.beta, f"{weight_key}.bias")
+
+    def port_dense(keras_variable, weight_key):
+        loader.port_weight(
+            keras_variable.kernel,
+            f"{weight_key}.weight",
+            hook_fn=lambda x, _: x.T,
+        )
+        if keras_variable.bias is not None:
+            loader.port_weight(keras_variable.bias, f"{weight_key}.bias")
+
+    def port_mha(keras_variable, weight_key, num_heads, hidden_dim):
+        # query
+        loader.port_weight(
+            keras_variable.query_dense.kernel,
+            f"{weight_key}.attention.query.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.query_dense.bias,
+            f"{weight_key}.attention.query.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # key
+        loader.port_weight(
+            keras_variable.key_dense.kernel,
+            f"{weight_key}.attention.key.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.key_dense.bias,
+            f"{weight_key}.attention.key.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # value
+        loader.port_weight(
+            keras_variable.value_dense.kernel,
+            f"{weight_key}.attention.value.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.value_dense.bias,
+            f"{weight_key}.attention.value.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # output
+        loader.port_weight(
+            keras_variable.output_dense.kernel,
+            f"{weight_key}.output.dense.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (num_heads, hidden_dim // num_heads, hidden_dim)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.output_dense.bias, f"{weight_key}.output.dense.bias"
+        )
+
+    # Embedding.
+    loader.port_weight(
+        keras_variable=backbone.embeddings.cls_token,
+        hf_weight_key="embeddings.cls_token",
+    )
+    if backbone.use_mask_token:
+        loader.port_weight(
+            keras_variable=backbone.embeddings.mask_token,
+            hf_weight_key="embeddings.mask_token",
+        )
+    if backbone.num_register_tokens > 0:
+        loader.port_weight(
+            keras_variable=backbone.embeddings.register_tokens,
+            hf_weight_key="embeddings.register_tokens",
+        )
+    loader.port_weight(
+        keras_variable=backbone.embeddings.position_embeddings,
+        hf_weight_key="embeddings.position_embeddings",
+    )
+    # Interpolate position embeddings to match the image shape.
+    backbone.embeddings.interpolated_position_embeddings.assign(
+        backbone.embeddings._interpolate_position_embeddings(
+            backbone.embeddings.position_embeddings,
+            patch_size=backbone.patch_size,
+            source_shape=backbone.embeddings.position_embedding_shape,
+            target_shape=backbone.image_shape,
+            antialias=backbone.embeddings.antialias_in_interpolation,
+        )
+    )
+    loader.port_weight(
+        keras_variable=backbone.embeddings.patch_embeddings.projection.kernel,
+        hf_weight_key="embeddings.patch_embeddings.projection.weight",
+        hook_fn=lambda x, _: np.transpose(x, (2, 3, 1, 0)),
+    )
+    loader.port_weight(
+        keras_variable=backbone.embeddings.patch_embeddings.projection.bias,
+        hf_weight_key="embeddings.patch_embeddings.projection.bias",
+    )
+
+    # Encoder.
+    hidden_dim = backbone.hidden_dim
+    num_heads = backbone.num_heads
+    for i, layer in enumerate(backbone.encoder.layers):
+        prefix = f"encoder.layer.{i}"
+        port_ln(layer.norm1, f"{prefix}.norm1")
+        port_mha(
+            layer.attention.attention,
+            f"{prefix}.attention",
+            num_heads,
+            hidden_dim,
+        )
+        loader.port_weight(
+            keras_variable=layer.layer_scale1.lambda1,
+            hf_weight_key=f"{prefix}.layer_scale1.lambda1",
+        )
+        port_ln(layer.norm2, f"{prefix}.norm2")
+        if backbone.use_swiglu_ffn:
+            port_dense(layer.mlp.weights_in, f"{prefix}.mlp.weights_in")
+            port_dense(layer.mlp.weights_out, f"{prefix}.mlp.weights_out")
+        else:
+            port_dense(layer.mlp.fc1, f"{prefix}.mlp.fc1")
+            port_dense(layer.mlp.fc2, f"{prefix}.mlp.fc2")
+        loader.port_weight(
+            keras_variable=layer.layer_scale2.lambda1,
+            hf_weight_key=f"{prefix}.layer_scale2.lambda1",
+        )
+
+    port_ln(backbone.layernorm, "layernorm")
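A minimal sketch of the converted backbone in use; not part of the diff. The checkpoint handle is an assumption, and the class is assumed to be exported as keras_hub.models.DINOV2Backbone (the diff shows new exports in keras_hub/models/__init__.py).

import numpy as np
import keras_hub

# Hypothetical handle, for illustration only.
backbone = keras_hub.models.DINOV2Backbone.from_preset("hf://facebook/dinov2-base")
# image_shape comes from the converted config, so nothing is hard-coded here.
height, width, channels = backbone.image_shape
features = backbone.predict(np.random.uniform(size=(1, height, width, channels)))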
keras_hub/src/utils/transformers/convert_esm.py (new file)
@@ -0,0 +1,159 @@
+import numpy as np
+
+from keras_hub.src.models.esm.esm_backbone import ESMBackbone
+from keras_hub.src.utils.preset_utils import get_file
+
+backbone_cls = ESMBackbone
+
+
+def convert_backbone_config(transformers_config):
+    return {
+        "vocabulary_size": transformers_config["vocab_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_heads": transformers_config["num_attention_heads"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "dropout": transformers_config["hidden_dropout_prob"],
+        "position_embedding_type": transformers_config[
+            "position_embedding_type"
+        ],
+        "pad_token_id": transformers_config["pad_token_id"],
+        "max_sequence_length": transformers_config.get(
+            "max_position_embeddings", None
+        ),
+        "layer_norm_eps": transformers_config.get("layer_norm_eps", 1e-12),
+        "use_pre_layer_norm": transformers_config.get(
+            "emb_layer_norm_before", False
+        ),
+        "activation": transformers_config.get("activation", "gelu"),
+        "max_wavelength": transformers_config.get("max_wavelength", 10000),
+    }
+
+
+def transpose_and_reshape(x, shape):
+    return np.reshape(np.transpose(x), shape)
+
+
+def convert_weights(backbone, loader, transformers_config):
+    # Embedding layer
+    loader.port_weight(
+        keras_variable=backbone.get_layer("token_embedding").embeddings,
+        hf_weight_key="embeddings.word_embeddings.weight",
+    )
+    if transformers_config["position_embedding_type"] == "absolute":
+        loader.port_weight(
+            keras_variable=backbone.get_layer(
+                "position_embedding"
+            ).position_embeddings,
+            hf_weight_key="embeddings.position_embeddings.weight",
+        )
+    if transformers_config.get("emb_layer_norm_before", False):
+        loader.port_weight(
+            keras_variable=backbone.get_layer("emb_layer_norm").gamma,
+            hf_weight_key="embeddings.layer_norm.weight",
+        )
+        loader.port_weight(
+            keras_variable=backbone.get_layer("emb_layer_norm").beta,
+            hf_weight_key="embeddings.layer_norm.bias",
+        )
+
+    loader.port_weight(
+        keras_variable=backbone.output_layer_norm.gamma,
+        hf_weight_key="encoder.emb_layer_norm_after.weight",
+    )
+    loader.port_weight(
+        keras_variable=backbone.output_layer_norm.beta,
+        hf_weight_key="encoder.emb_layer_norm_after.bias",
+    )
+
+    # Attention blocks
+    for i in range(backbone.num_layers):
+        block = backbone.get_layer(f"transformer_layer_{i}")
+        attn = block.attention_layer
+        hf_prefix = "encoder.layer."
+        # Attention layers
+        loader.port_weight(
+            keras_variable=attn.q_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.query.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.q_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.query.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        loader.port_weight(
+            keras_variable=attn.k_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.key.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.k_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.key.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        loader.port_weight(
+            keras_variable=attn.v_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.value.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.v_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.value.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        loader.port_weight(
+            keras_variable=attn.o_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.output.dense.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.o_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.output.dense.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        # Attention layer norm.
+        loader.port_weight(
+            keras_variable=block.attention_norm.gamma,
+            hf_weight_key=f"{hf_prefix}{i}.attention.LayerNorm.weight",
+        )
+        loader.port_weight(
+            keras_variable=block.attention_norm.beta,
+            hf_weight_key=f"{hf_prefix}{i}.attention.LayerNorm.bias",
+        )
+        # MLP layers
+        loader.port_weight(
+            keras_variable=block.feedforward_intermediate_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.intermediate.dense.weight",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_intermediate_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.intermediate.dense.bias",
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_output_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.output.dense.weight",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_output_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.output.dense.bias",
+        )
+        # Output layer norm.
+        loader.port_weight(
+            keras_variable=block.feedforward_norm.gamma,
+            hf_weight_key=f"{hf_prefix}{i}.LayerNorm.weight",
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_norm.beta,
+            hf_weight_key=f"{hf_prefix}{i}.LayerNorm.bias",
+        )


+def convert_tokenizer(cls, preset, **kwargs):
+    return cls(
+        get_file(preset, "vocab.txt"),
+        lowercase=False,
+        **kwargs,
+    )
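A sketch of loading a converted ESM-2 checkpoint; not part of the diff. The handle and the token_ids/padding_mask input signature are assumptions made for illustration, following the usual keras-hub text-backbone convention.

import numpy as np
import keras_hub

# Hypothetical handle and toy inputs, for illustration only.
backbone = keras_hub.models.ESMBackbone.from_preset("hf://facebook/esm2_t6_8M_UR50D")
outputs = backbone.predict(
    {
        "token_ids": np.array([[0, 5, 6, 7, 2]]),  # toy ids, not a real protein
        "padding_mask": np.ones((1, 5), dtype="int32"),
    }
)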
keras_hub/src/utils/transformers/convert_llama3.py
@@ -127,6 +127,12 @@ def convert_tokenizer(cls, preset, **kwargs):
     vocab = tokenizer_config["model"]["vocab"]
     merges = tokenizer_config["model"]["merges"]

+    # Handle different merge formats
+    if merges and isinstance(merges[0], list) and len(merges[0]) == 2:
+        # Convert list of lists format [["Ġ", "a"], ["Ġ", "b"]]
+        # to space-separated strings
+        merges = [" ".join(merge) for merge in merges]
+
     # Load all special tokens with the exception of "reserved" ones.
     special_tokens = set()
     for token in tokenizer_config["added_tokens"]:
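The added branch in isolation: newer tokenizer.json files may serialize BPE merges as two-element lists, while older files use space-separated strings, and both are normalized to the string form. A standalone sketch:

# Both merge encodings end up as space-separated strings.
merges_list_format = [["Ġ", "a"], ["Ġ", "b"]]
merges_string_format = ["Ġ a", "Ġ b"]

def normalize(merges):
    if merges and isinstance(merges[0], list) and len(merges[0]) == 2:
        merges = [" ".join(merge) for merge in merges]
    return merges

assert normalize(merges_list_format) == normalize(merges_string_format) == ["Ġ a", "Ġ b"]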
keras_hub/src/utils/transformers/convert_qwen3.py (new file)
@@ -0,0 +1,145 @@
+import numpy as np
+
+from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
+from keras_hub.src.utils.preset_utils import load_json
+
+backbone_cls = Qwen3Backbone
+
+
+def convert_backbone_config(transformers_config):
+    return {
+        "vocabulary_size": transformers_config["vocab_size"],
+        "head_dim": transformers_config["head_dim"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_query_heads": transformers_config["num_attention_heads"],
+        "num_key_value_heads": transformers_config["num_key_value_heads"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "layer_norm_epsilon": transformers_config["rms_norm_eps"],
+        "rope_max_wavelength": transformers_config["rope_theta"],
+        "sliding_window_size": transformers_config["sliding_window"]
+        if transformers_config["use_sliding_window"]
+        else None,
+        "tie_word_embeddings": transformers_config["tie_word_embeddings"],
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    loader.port_weight(
+        keras_variable=backbone.get_layer("token_embedding").embeddings,
+        hf_weight_key="model.embed_tokens.weight",
+    )
+    if not backbone.tie_word_embeddings:
+        loader.port_weight(
+            keras_variable=backbone.get_layer(
+                "token_embedding"
+            ).reverse_embeddings,
+            hf_weight_key="lm_head.weight",
+            # rearrange_pattern="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+
+    def transpose_and_reshape(x, shape):
+        return np.reshape(np.transpose(x), shape)
+
+    for i in range(backbone.num_layers):
+        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")
+
+        # Input layernorm
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layernorm.scale,
+            hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
+        )
+
+        # Attention layers
+
+        ## Query
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._query_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._query_dense_layer_norm.scale,
+            hf_weight_key=f"model.layers.{i}.self_attn.q_norm.weight",
+        )
+        ## Key
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._key_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._key_dense_layer_norm.scale,
+            hf_weight_key=f"model.layers.{i}.self_attn.k_norm.weight",
+        )
+        ## Value
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._value_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        ## Output
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._output_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
+            # rearrange_patterns="c (a b) -> a b c",
+            # rearrange_dims={"a": backbone.num_query_heads},
+            hook_fn=transpose_and_reshape,
+        )
+
+        # MLP layers
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_intermediate_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.up_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_output_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.down_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_gate_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.gate_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+
+        # Feedforward layernorm
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_layernorm.scale,
+            hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
+        )
+
+    # Final normalization layer
+    loader.port_weight(
+        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
+        hf_weight_key="model.norm.weight",
+    )
+
+    return backbone
+
+
+def convert_tokenizer(cls, preset, **kwargs):
+    tokenizer_config = load_json(preset, "tokenizer.json")
+    vocab = tokenizer_config["model"]["vocab"]
+    merges = tokenizer_config["model"]["merges"]
+    merges = [" ".join(item) for item in merges]
+
+    # Load all special tokens with the exception of "reserved" ones.
+    special_tokens = set()
+    for token in tokenizer_config["added_tokens"]:
+        if not token["content"].startswith("<|reserved_special_token_"):
+            vocab[token["content"]] = token["id"]
+            special_tokens.add(token["content"])
+
+    kwargs.update(
+        {
+            "unsplittable_tokens": list(special_tokens),
+        }
+    )
+
+    return cls(vocabulary=vocab, merges=merges, **kwargs)
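A sketch of the new Qwen3 support in use; not part of the diff. The checkpoint handle is an assumption for illustration, and generate() is the standard keras-hub CausalLM API.

import keras_hub

# Hypothetical handle; from_preset("hf://...") routes through convert_qwen3.py.
causal_lm = keras_hub.models.Qwen3CausalLM.from_preset("hf://Qwen/Qwen3-0.6B")
print(causal_lm.generate("Keras is", max_length=30))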