keras-hub 0.21.1__py3-none-any.whl → 0.22.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +9 -0
- keras_hub/models/__init__.py +47 -0
- keras_hub/src/layers/modeling/transformer_encoder.py +6 -3
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +17 -3
- keras_hub/src/layers/preprocessing/start_end_packer.py +24 -6
- keras_hub/src/models/backbone.py +13 -10
- keras_hub/src/models/clip/clip_backbone.py +3 -102
- keras_hub/src/models/clip/clip_layers.py +295 -0
- keras_hub/src/models/clip/clip_preprocessor.py +57 -48
- keras_hub/src/models/clip/clip_text_encoder.py +2 -2
- keras_hub/src/models/clip/clip_vision_encoder.py +3 -3
- keras_hub/src/models/deit/__init__.py +5 -0
- keras_hub/src/models/deit/deit_backbone.py +154 -0
- keras_hub/src/models/deit/deit_image_classifier.py +171 -0
- keras_hub/src/models/deit/deit_image_classifier_preprocessor.py +12 -0
- keras_hub/src/models/deit/deit_image_converter.py +8 -0
- keras_hub/src/models/deit/deit_layers.py +519 -0
- keras_hub/src/models/deit/deit_presets.py +49 -0
- keras_hub/src/models/dinov2/__init__.py +5 -0
- keras_hub/src/models/dinov2/dinov2_backbone.py +228 -0
- keras_hub/src/models/dinov2/dinov2_image_converter.py +8 -0
- keras_hub/src/models/dinov2/dinov2_layers.py +886 -0
- keras_hub/src/models/dinov2/dinov2_presets.py +89 -0
- keras_hub/src/models/esm/__init__.py +5 -0
- keras_hub/src/models/esm/esm_attention.py +95 -0
- keras_hub/src/models/esm/esm_backbone.py +229 -0
- keras_hub/src/models/esm/esm_classifier.py +184 -0
- keras_hub/src/models/esm/esm_classifier_preprocessor.py +135 -0
- keras_hub/src/models/esm/esm_encoder.py +134 -0
- keras_hub/src/models/esm/esm_masked_plm.py +117 -0
- keras_hub/src/models/esm/esm_masked_plm_preprocessor.py +143 -0
- keras_hub/src/models/esm/esm_presets.py +53 -0
- keras_hub/src/models/esm/esm_tokenizer.py +82 -0
- keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +6 -2
- keras_hub/src/models/gemma/gemma_attention.py +1 -1
- keras_hub/src/models/gemma3/gemma3_backbone.py +2 -2
- keras_hub/src/models/gemma3/gemma3_interleave_embeddings.py +1 -1
- keras_hub/src/models/hgnetv2/__init__.py +5 -0
- keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +193 -0
- keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +148 -0
- keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py +216 -0
- keras_hub/src/models/hgnetv2/hgnetv2_image_classifier_preprocessor.py +14 -0
- keras_hub/src/models/hgnetv2/hgnetv2_image_converter.py +8 -0
- keras_hub/src/models/hgnetv2/hgnetv2_layers.py +918 -0
- keras_hub/src/models/hgnetv2/hgnetv2_presets.py +58 -0
- keras_hub/src/models/llama3/llama3_presets.py +3 -3
- keras_hub/src/models/mistral/mistral_presets.py +17 -1
- keras_hub/src/models/mixtral/mixtral_presets.py +2 -2
- keras_hub/src/models/mobilenet/mobilenet_presets.py +4 -4
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +2 -2
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +2 -2
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +17 -17
- keras_hub/src/models/qwen3/__init__.py +5 -0
- keras_hub/src/models/qwen3/qwen3_attention.py +369 -0
- keras_hub/src/models/qwen3/qwen3_backbone.py +191 -0
- keras_hub/src/models/qwen3/qwen3_causal_lm.py +390 -0
- keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py +10 -0
- keras_hub/src/models/qwen3/qwen3_decoder.py +309 -0
- keras_hub/src/models/qwen3/qwen3_layernorm.py +38 -0
- keras_hub/src/models/qwen3/qwen3_presets.py +73 -0
- keras_hub/src/models/qwen3/qwen3_tokenizer.py +48 -0
- keras_hub/src/models/qwen_moe/qwen_moe_attention.py +1 -0
- keras_hub/src/models/qwen_moe/qwen_moe_presets.py +2 -2
- keras_hub/src/models/roformer_v2/roformer_v2_attention.py +0 -2
- keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +16 -7
- keras_hub/src/models/stable_diffusion_3/mmdit.py +61 -4
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +31 -32
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +6 -2
- keras_hub/src/models/vit/vit_backbone.py +31 -11
- keras_hub/src/models/vit/vit_image_converter.py +0 -70
- keras_hub/src/models/vit/vit_layers.py +33 -18
- keras_hub/src/models/vit/vit_presets.py +11 -11
- keras_hub/src/utils/keras_utils.py +17 -0
- keras_hub/src/utils/preset_utils.py +19 -4
- keras_hub/src/utils/tensor_utils.py +14 -0
- keras_hub/src/utils/transformers/convert_deit.py +155 -0
- keras_hub/src/utils/transformers/convert_dinov2.py +180 -0
- keras_hub/src/utils/transformers/convert_esm.py +159 -0
- keras_hub/src/utils/transformers/convert_llama3.py +6 -0
- keras_hub/src/utils/transformers/convert_qwen3.py +145 -0
- keras_hub/src/utils/transformers/export/gemma.py +89 -0
- keras_hub/src/utils/transformers/export/hf_exporter.py +98 -0
- keras_hub/src/utils/transformers/preset_loader.py +14 -2
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +1 -0
- {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/METADATA +4 -4
- {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/RECORD +92 -48
- keras_hub/src/models/clip/clip_encoder_block.py +0 -111
- keras_hub/src/models/clip/clip_vision_embedding.py +0 -101
- {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/WHEEL +0 -0
- {keras_hub-0.21.1.dist-info → keras_hub-0.22.0.dev0.dist-info}/top_level.txt +0 -0
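Most of the new surface area is Hugging Face checkpoint support: converter modules for DeiT, DINOV2, ESM and Qwen3 under keras_hub/src/utils/transformers/, a new export/ package, and the preset_loader.py change that registers the converters. Aside (not in the diff), as context: a minimal sketch of how such converters are typically exercised, assuming the new wheel is installed; the checkpoint handle is illustrative, not something this diff guarantees.

    import keras_hub

    # "hf://" handles are resolved by preset_utils/preset_loader.py, which picks
    # the convert_* module matching the checkpoint's `model_type`.
    # The handle below is an assumption for illustration only.
    classifier = keras_hub.models.ImageClassifier.from_preset(
        "hf://facebook/deit-base-distilled-patch16-224"
    )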
keras_hub/src/utils/transformers/convert_deit.py
@@ -0,0 +1,155 @@
+import numpy as np
+
+from keras_hub.src.models.deit.deit_backbone import DeiTBackbone
+
+backbone_cls = DeiTBackbone
+
+
+def convert_backbone_config(transformers_config):
+    image_size = transformers_config["image_size"]
+    return {
+        "image_shape": (image_size, image_size, 3),
+        "patch_size": transformers_config["patch_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_heads": transformers_config["num_attention_heads"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "dropout_rate": transformers_config["hidden_dropout_prob"],
+        "attention_dropout": transformers_config[
+            "attention_probs_dropout_prob"
+        ],
+        "layer_norm_epsilon": transformers_config["layer_norm_eps"],
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    def port_ln(keras_variable, weight_key):
+        loader.port_weight(keras_variable.gamma, f"{weight_key}.weight")
+        loader.port_weight(keras_variable.beta, f"{weight_key}.bias")
+
+    def port_dense(keras_variable, weight_key):
+        loader.port_weight(
+            keras_variable.kernel,
+            f"{weight_key}.weight",
+            hook_fn=lambda x, _: x.T,
+        )
+        if keras_variable.bias is not None:
+            loader.port_weight(keras_variable.bias, f"{weight_key}.bias")
+
+    def port_mha(keras_variable, weight_key, num_heads, hidden_dim):
+        # query
+        loader.port_weight(
+            keras_variable.query_dense.kernel,
+            f"{weight_key}.attention.query.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.query_dense.bias,
+            f"{weight_key}.attention.query.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # key
+        loader.port_weight(
+            keras_variable.key_dense.kernel,
+            f"{weight_key}.attention.key.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.key_dense.bias,
+            f"{weight_key}.attention.key.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # value
+        loader.port_weight(
+            keras_variable.value_dense.kernel,
+            f"{weight_key}.attention.value.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.value_dense.bias,
+            f"{weight_key}.attention.value.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # output
+        loader.port_weight(
+            keras_variable.output_dense.kernel,
+            f"{weight_key}.output.dense.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (num_heads, hidden_dim // num_heads, hidden_dim)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.output_dense.bias, f"{weight_key}.output.dense.bias"
+        )
+
+    loader.port_weight(
+        keras_variable=backbone.layers[1].patch_embedding.kernel,
+        hf_weight_key="deit.embeddings.patch_embeddings.projection.weight",
+        hook_fn=lambda x, _: np.transpose(x, (2, 3, 1, 0)),
+    )
+
+    loader.port_weight(
+        backbone.layers[1].patch_embedding.bias,
+        "deit.embeddings.patch_embeddings.projection.bias",
+    )
+
+    loader.port_weight(
+        backbone.layers[1].class_token,
+        "deit.embeddings.cls_token",
+    )
+
+    loader.port_weight(
+        backbone.layers[1].distillation_token,
+        "deit.embeddings.distillation_token",
+    )
+
+    loader.port_weight(
+        backbone.layers[1].position_embedding,
+        "deit.embeddings.position_embeddings",
+    )
+
+    encoder_layers = backbone.layers[2].encoder_layers
+    for i, encoder_block in enumerate(encoder_layers):
+        prefix = "deit.encoder.layer"
+        num_heads = encoder_block.num_heads
+        hidden_dim = encoder_block.hidden_dim
+
+        port_mha(
+            encoder_block.mha,
+            f"{prefix}.{i}.attention",
+            num_heads,
+            hidden_dim,
+        )
+        port_ln(encoder_block.layer_norm_1, f"{prefix}.{i}.layernorm_before")
+        port_ln(encoder_block.layer_norm_2, f"{prefix}.{i}.layernorm_after")
+
+        port_dense(encoder_block.mlp.dense, f"{prefix}.{i}.intermediate.dense")
+        port_dense(
+            encoder_block.output_layer.dense, f"{prefix}.{i}.output.dense"
+        )
+    port_ln(backbone.layers[2].layer_norm, "deit.layernorm")
+
+
+def convert_head(task, loader, transformers_config):
+    prefix = "cls_classifier."
+    loader.port_weight(
+        task.output_dense.kernel,
+        hf_weight_key=prefix + "weight",
+        hook_fn=lambda x, _: x.T,
+    )
+    loader.port_weight(
+        task.output_dense.bias,
+        hf_weight_key=prefix + "bias",
+    )
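Aside (not in the diff): the reshape hooks in port_mha above map a fused Hugging Face attention projection onto the per-head kernel layout of Keras MultiHeadAttention. A minimal, self-contained illustration with assumed sizes:

    import numpy as np

    hidden_dim, num_heads = 768, 12  # illustrative DeiT-Base-like sizes
    hf_query_weight = np.zeros((hidden_dim, hidden_dim))  # PyTorch (out, in) layout
    # Same transform as the query `hook_fn` above: transpose to (in, out), then
    # split the output axis into (num_heads, head_dim).
    keras_kernel = np.reshape(
        hf_query_weight.T, (hidden_dim, num_heads, hidden_dim // num_heads)
    )
    print(keras_kernel.shape)  # (768, 12, 64)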
keras_hub/src/utils/transformers/convert_dinov2.py
@@ -0,0 +1,180 @@
+import numpy as np
+
+from keras_hub.src.models.dinov2.dinov2_backbone import DINOV2Backbone
+
+backbone_cls = DINOV2Backbone
+
+
+def convert_backbone_config(transformers_config):
+    model_type = transformers_config["model_type"]
+    antialias_in_interpolation = False if model_type == "dinov2" else True
+    image_size = transformers_config["image_size"]
+    intermediate_dim = int(
+        transformers_config["hidden_size"] * transformers_config["mlp_ratio"]
+    )
+    return {
+        "patch_size": transformers_config["patch_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "num_heads": transformers_config["num_attention_heads"],
+        "intermediate_dim": intermediate_dim,
+        "layer_scale_init_value": transformers_config["layerscale_value"],
+        "num_register_tokens": transformers_config.get(
+            "num_register_tokens", 0
+        ),
+        "use_mask_token": transformers_config.get("use_mask_token", True),
+        "use_swiglu_ffn": transformers_config["use_swiglu_ffn"],
+        "dropout_rate": transformers_config["hidden_dropout_prob"],
+        "drop_path_rate": transformers_config["drop_path_rate"],
+        "image_shape": (image_size, image_size, 3),
+        "position_embedding_shape": (image_size, image_size),
+        "antialias_in_interpolation": antialias_in_interpolation,
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    if not isinstance(backbone, DINOV2Backbone):
+        raise ValueError(
+            "The provided backbone must be an instance of DINOV2Backbone. "
+            f"Received: {type(backbone)}"
+        )
+
+    def port_ln(keras_variable, weight_key):
+        loader.port_weight(keras_variable.gamma, f"{weight_key}.weight")
+        loader.port_weight(keras_variable.beta, f"{weight_key}.bias")
+
+    def port_dense(keras_variable, weight_key):
+        loader.port_weight(
+            keras_variable.kernel,
+            f"{weight_key}.weight",
+            hook_fn=lambda x, _: x.T,
+        )
+        if keras_variable.bias is not None:
+            loader.port_weight(keras_variable.bias, f"{weight_key}.bias")
+
+    def port_mha(keras_variable, weight_key, num_heads, hidden_dim):
+        # query
+        loader.port_weight(
+            keras_variable.query_dense.kernel,
+            f"{weight_key}.attention.query.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.query_dense.bias,
+            f"{weight_key}.attention.query.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # key
+        loader.port_weight(
+            keras_variable.key_dense.kernel,
+            f"{weight_key}.attention.key.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.key_dense.bias,
+            f"{weight_key}.attention.key.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # value
+        loader.port_weight(
+            keras_variable.value_dense.kernel,
+            f"{weight_key}.attention.value.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (hidden_dim, num_heads, hidden_dim // num_heads)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.value_dense.bias,
+            f"{weight_key}.attention.value.bias",
+            hook_fn=lambda x, _: np.reshape(
+                x, (num_heads, hidden_dim // num_heads)
+            ),
+        )
+        # output
+        loader.port_weight(
+            keras_variable.output_dense.kernel,
+            f"{weight_key}.output.dense.weight",
+            hook_fn=lambda x, _: np.reshape(
+                x.T, (num_heads, hidden_dim // num_heads, hidden_dim)
+            ),
+        )
+        loader.port_weight(
+            keras_variable.output_dense.bias, f"{weight_key}.output.dense.bias"
+        )
+
+    # Embedding.
+    loader.port_weight(
+        keras_variable=backbone.embeddings.cls_token,
+        hf_weight_key="embeddings.cls_token",
+    )
+    if backbone.use_mask_token:
+        loader.port_weight(
+            keras_variable=backbone.embeddings.mask_token,
+            hf_weight_key="embeddings.mask_token",
+        )
+    if backbone.num_register_tokens > 0:
+        loader.port_weight(
+            keras_variable=backbone.embeddings.register_tokens,
+            hf_weight_key="embeddings.register_tokens",
+        )
+    loader.port_weight(
+        keras_variable=backbone.embeddings.position_embeddings,
+        hf_weight_key="embeddings.position_embeddings",
+    )
+    # Interpolate position embeddings to match the image shape.
+    backbone.embeddings.interpolated_position_embeddings.assign(
+        backbone.embeddings._interpolate_position_embeddings(
+            backbone.embeddings.position_embeddings,
+            patch_size=backbone.patch_size,
+            source_shape=backbone.embeddings.position_embedding_shape,
+            target_shape=backbone.image_shape,
+            antialias=backbone.embeddings.antialias_in_interpolation,
+        )
+    )
+    loader.port_weight(
+        keras_variable=backbone.embeddings.patch_embeddings.projection.kernel,
+        hf_weight_key="embeddings.patch_embeddings.projection.weight",
+        hook_fn=lambda x, _: np.transpose(x, (2, 3, 1, 0)),
+    )
+    loader.port_weight(
+        keras_variable=backbone.embeddings.patch_embeddings.projection.bias,
+        hf_weight_key="embeddings.patch_embeddings.projection.bias",
+    )
+
+    # Encoder.
+    hidden_dim = backbone.hidden_dim
+    num_heads = backbone.num_heads
+    for i, layer in enumerate(backbone.encoder.layers):
+        prefix = f"encoder.layer.{i}"
+        port_ln(layer.norm1, f"{prefix}.norm1")
+        port_mha(
+            layer.attention.attention,
+            f"{prefix}.attention",
+            num_heads,
+            hidden_dim,
+        )
+        loader.port_weight(
+            keras_variable=layer.layer_scale1.lambda1,
+            hf_weight_key=f"{prefix}.layer_scale1.lambda1",
+        )
+        port_ln(layer.norm2, f"{prefix}.norm2")
+        if backbone.use_swiglu_ffn:
+            port_dense(layer.mlp.weights_in, f"{prefix}.mlp.weights_in")
+            port_dense(layer.mlp.weights_out, f"{prefix}.mlp.weights_out")
+        else:
+            port_dense(layer.mlp.fc1, f"{prefix}.mlp.fc1")
+            port_dense(layer.mlp.fc2, f"{prefix}.mlp.fc2")
+        loader.port_weight(
+            keras_variable=layer.layer_scale2.lambda1,
+            hf_weight_key=f"{prefix}.layer_scale2.lambda1",
+        )
+
+    port_ln(backbone.layernorm, "layernorm")
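Aside (not in the diff): convert_backbone_config above derives DINOV2Backbone keyword arguments purely from the checkpoint's config.json keys. A small, hedged demonstration with made-up values, assuming this wheel is installed:

    from keras_hub.src.utils.transformers.convert_dinov2 import (
        convert_backbone_config,
    )

    # Illustrative values only; real ones come from the checkpoint's config.json.
    sample_config = {
        "model_type": "dinov2",
        "image_size": 518,
        "patch_size": 14,
        "num_hidden_layers": 12,
        "hidden_size": 768,
        "num_attention_heads": 12,
        "mlp_ratio": 4,
        "layerscale_value": 1.0,
        "use_swiglu_ffn": False,
        "hidden_dropout_prob": 0.0,
        "drop_path_rate": 0.0,
    }
    kwargs = convert_backbone_config(sample_config)
    print(kwargs["intermediate_dim"])            # 3072 (= 768 * 4)
    print(kwargs["antialias_in_interpolation"])  # False for model_type "dinov2"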
keras_hub/src/utils/transformers/convert_esm.py
@@ -0,0 +1,159 @@
+import numpy as np
+
+from keras_hub.src.models.esm.esm_backbone import ESMBackbone
+from keras_hub.src.utils.preset_utils import get_file
+
+backbone_cls = ESMBackbone
+
+
+def convert_backbone_config(transformers_config):
+    return {
+        "vocabulary_size": transformers_config["vocab_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_heads": transformers_config["num_attention_heads"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "dropout": transformers_config["hidden_dropout_prob"],
+        "position_embedding_type": transformers_config[
+            "position_embedding_type"
+        ],
+        "pad_token_id": transformers_config["pad_token_id"],
+        "max_sequence_length": transformers_config.get(
+            "max_position_embeddings", None
+        ),
+        "layer_norm_eps": transformers_config.get("layer_norm_eps", 1e-12),
+        "use_pre_layer_norm": transformers_config.get(
+            "emb_layer_norm_before", False
+        ),
+        "activation": transformers_config.get("activation", "gelu"),
+        "max_wavelength": transformers_config.get("max_wavelength", 10000),
+    }
+
+
+def transpose_and_reshape(x, shape):
+    return np.reshape(np.transpose(x), shape)
+
+
+def convert_weights(backbone, loader, transformers_config):
+    # Embedding layer
+    loader.port_weight(
+        keras_variable=backbone.get_layer("token_embedding").embeddings,
+        hf_weight_key="embeddings.word_embeddings.weight",
+    )
+    if transformers_config["position_embedding_type"] == "absolute":
+        loader.port_weight(
+            keras_variable=backbone.get_layer(
+                "position_embedding"
+            ).position_embeddings,
+            hf_weight_key="embeddings.position_embeddings.weight",
+        )
+    if transformers_config.get("emb_layer_norm_before", False):
+        loader.port_weight(
+            keras_variable=backbone.get_layer("emb_layer_norm").gamma,
+            hf_weight_key="embeddings.layer_norm.weight",
+        )
+        loader.port_weight(
+            keras_variable=backbone.get_layer("emb_layer_norm").beta,
+            hf_weight_key="embeddings.layer_norm.bias",
+        )
+
+    loader.port_weight(
+        keras_variable=backbone.output_layer_norm.gamma,
+        hf_weight_key="encoder.emb_layer_norm_after.weight",
+    )
+    loader.port_weight(
+        keras_variable=backbone.output_layer_norm.beta,
+        hf_weight_key="encoder.emb_layer_norm_after.bias",
+    )
+
+    # Attention blocks
+    for i in range(backbone.num_layers):
+        block = backbone.get_layer(f"transformer_layer_{i}")
+        attn = block.attention_layer
+        hf_prefix = "encoder.layer."
+        # Attention layers
+        loader.port_weight(
+            keras_variable=attn.q_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.query.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.q_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.query.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        loader.port_weight(
+            keras_variable=attn.k_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.key.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.k_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.key.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        loader.port_weight(
+            keras_variable=attn.v_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.value.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.v_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.self.value.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        loader.port_weight(
+            keras_variable=attn.o_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.attention.output.dense.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=attn.o_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.attention.output.dense.bias",
+            hook_fn=lambda hf_tensor, shape: np.reshape(hf_tensor, shape),
+        )
+        # Attention layer norm.
+        loader.port_weight(
+            keras_variable=block.attention_norm.gamma,
+            hf_weight_key=f"{hf_prefix}{i}.attention.LayerNorm.weight",
+        )
+        loader.port_weight(
+            keras_variable=block.attention_norm.beta,
+            hf_weight_key=f"{hf_prefix}{i}.attention.LayerNorm.bias",
+        )
+        # MLP layers
+        loader.port_weight(
+            keras_variable=block.feedforward_intermediate_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.intermediate.dense.weight",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_intermediate_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.intermediate.dense.bias",
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_output_dense.kernel,
+            hf_weight_key=f"{hf_prefix}{i}.output.dense.weight",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_output_dense.bias,
+            hf_weight_key=f"{hf_prefix}{i}.output.dense.bias",
+        )
+        # Output layer norm.
+        loader.port_weight(
+            keras_variable=block.feedforward_norm.gamma,
+            hf_weight_key=f"{hf_prefix}{i}.LayerNorm.weight",
+        )
+        loader.port_weight(
+            keras_variable=block.feedforward_norm.beta,
+            hf_weight_key=f"{hf_prefix}{i}.LayerNorm.bias",
+        )
+
+
+def convert_tokenizer(cls, preset, **kwargs):
+    return cls(
+        get_file(preset, "vocab.txt"),
+        lowercase=False,
+        **kwargs,
+    )
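Aside (not in the diff): with this converter registered, an ESM checkpoint should be loadable straight from the Hugging Face Hub through the usual preset API. A hedged sketch; the handle is illustrative and end-to-end support depends on the rest of this release:

    import keras_hub

    preset = "hf://facebook/esm2_t6_8M_UR50D"  # illustrative ESM-2 checkpoint handle
    backbone = keras_hub.models.Backbone.from_preset(preset)
    tokenizer = keras_hub.tokenizers.Tokenizer.from_preset(preset)
    token_ids = tokenizer("MKTAYIAKQR")  # amino-acid sequence in, token ids out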
keras_hub/src/utils/transformers/convert_llama3.py
@@ -127,6 +127,12 @@ def convert_tokenizer(cls, preset, **kwargs):
     vocab = tokenizer_config["model"]["vocab"]
     merges = tokenizer_config["model"]["merges"]

+    # Handle different merge formats
+    if merges and isinstance(merges[0], list) and len(merges[0]) == 2:
+        # Convert list of lists format [["Ġ", "a"], ["Ġ", "b"]]
+        # to space-separated strings
+        merges = [" ".join(merge) for merge in merges]
+
     # Load all special tokens with the exception of "reserved" ones.
     special_tokens = set()
     for token in tokenizer_config["added_tokens"]:
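A minimal illustration (not in the diff) of the two merges encodings this branch now normalizes, using the values from the comment above:

    merges_as_pairs = [["Ġ", "a"], ["Ġ", "b"]]  # newer list-of-pairs tokenizer.json format
    merges_as_strings = ["Ġ a", "Ġ b"]          # space-separated format the tokenizer expects
    assert [" ".join(merge) for merge in merges_as_pairs] == merges_as_strings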
keras_hub/src/utils/transformers/convert_qwen3.py
@@ -0,0 +1,145 @@
+import numpy as np
+
+from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
+from keras_hub.src.utils.preset_utils import load_json
+
+backbone_cls = Qwen3Backbone
+
+
+def convert_backbone_config(transformers_config):
+    return {
+        "vocabulary_size": transformers_config["vocab_size"],
+        "head_dim": transformers_config["head_dim"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_query_heads": transformers_config["num_attention_heads"],
+        "num_key_value_heads": transformers_config["num_key_value_heads"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "layer_norm_epsilon": transformers_config["rms_norm_eps"],
+        "rope_max_wavelength": transformers_config["rope_theta"],
+        "sliding_window_size": transformers_config["sliding_window"]
+        if transformers_config["use_sliding_window"]
+        else None,
+        "tie_word_embeddings": transformers_config["tie_word_embeddings"],
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    loader.port_weight(
+        keras_variable=backbone.get_layer("token_embedding").embeddings,
+        hf_weight_key="model.embed_tokens.weight",
+    )
+    if not backbone.tie_word_embeddings:
+        loader.port_weight(
+            keras_variable=backbone.get_layer(
+                "token_embedding"
+            ).reverse_embeddings,
+            hf_weight_key="lm_head.weight",
+            # rearrange_pattern="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+
+    def transpose_and_reshape(x, shape):
+        return np.reshape(np.transpose(x), shape)
+
+    for i in range(backbone.num_layers):
+        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")
+
+        # Input layernorm
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layernorm.scale,
+            hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
+        )
+
+        # Attention layers
+
+        ## Query
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._query_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._query_dense_layer_norm.scale,
+            hf_weight_key=f"model.layers.{i}.self_attn.q_norm.weight",
+        )
+        ## Key
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._key_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._key_dense_layer_norm.scale,
+            hf_weight_key=f"model.layers.{i}.self_attn.k_norm.weight",
+        )
+        ## Value
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._value_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        ## Output
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._output_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
+            # rearrange_patterns="c (a b) -> a b c",
+            # rearrange_dims={"a": backbone.num_query_heads},
+            hook_fn=transpose_and_reshape,
+        )
+
+        # MLP layers
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_intermediate_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.up_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_output_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.down_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_gate_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.gate_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+
+        # Feedforward layernorm
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_layernorm.scale,
+            hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
+        )
+
+    # Final normalization layer
+    loader.port_weight(
+        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
+        hf_weight_key="model.norm.weight",
+    )
+
+    return backbone
+
+
+def convert_tokenizer(cls, preset, **kwargs):
+    tokenizer_config = load_json(preset, "tokenizer.json")
+    vocab = tokenizer_config["model"]["vocab"]
+    merges = tokenizer_config["model"]["merges"]
+    merges = [" ".join(item) for item in merges]
+
+    # Load all special tokens with the exception of "reserved" ones.
+    special_tokens = set()
+    for token in tokenizer_config["added_tokens"]:
+        if not token["content"].startswith("<|reserved_special_token_"):
+            vocab[token["content"]] = token["id"]
+            special_tokens.add(token["content"])
+
+    kwargs.update(
+        {
+            "unsplittable_tokens": list(special_tokens),
+        }
+    )
+
+    return cls(vocabulary=vocab, merges=merges, **kwargs)
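Aside (not in the diff): a hedged end-to-end sketch for the Qwen3 path; the checkpoint handle and generation settings are assumptions, not taken from this release.

    import keras_hub

    # Illustrative handle; any Qwen3 checkpoint whose config exposes the keys read
    # in convert_backbone_config above should load through this converter.
    qwen3_lm = keras_hub.models.CausalLM.from_preset("hf://Qwen/Qwen3-0.6B")
    print(qwen3_lm.generate("KerasHub checkpoint converters map", max_length=30))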