keras-hub-nightly 0.24.0.dev202511220420__py3-none-any.whl → 0.26.0.dev202601010440__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of keras-hub-nightly might be problematic. Click here for more details.
- keras_hub/models/__init__.py +12 -0
- keras_hub/src/layers/modeling/reversible_embedding.py +2 -275
- keras_hub/src/layers/modeling/rotary_embedding.py +188 -14
- keras_hub/src/layers/modeling/token_and_position_embedding.py +1 -3
- keras_hub/src/models/albert/albert_backbone.py +1 -3
- keras_hub/src/models/bart/bart_backbone.py +1 -3
- keras_hub/src/models/bert/bert_backbone.py +1 -3
- keras_hub/src/models/bloom/bloom_backbone.py +1 -3
- keras_hub/src/models/causal_lm.py +23 -1
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -3
- keras_hub/src/models/dinov3/dinov3_presets.py +90 -1
- keras_hub/src/models/electra/electra_backbone.py +1 -3
- keras_hub/src/models/esm/esm_attention.py +11 -4
- keras_hub/src/models/f_net/f_net_backbone.py +1 -3
- keras_hub/src/models/falcon/falcon_backbone.py +1 -3
- keras_hub/src/models/gemma/gemma_backbone.py +1 -3
- keras_hub/src/models/gemma/gemma_causal_lm.py +16 -0
- keras_hub/src/models/gemma3/gemma3_backbone.py +1 -3
- keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +8 -3
- keras_hub/src/models/gemma3/gemma3_presets.py +12 -0
- keras_hub/src/models/gemma3/gemma3_tokenizer.py +20 -8
- keras_hub/src/models/gpt2/gpt2_backbone.py +1 -3
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +17 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +1 -3
- keras_hub/src/models/gpt_oss/__init__.py +5 -0
- keras_hub/src/models/gpt_oss/gpt_oss_attention.py +330 -0
- keras_hub/src/models/gpt_oss/gpt_oss_backbone.py +219 -0
- keras_hub/src/models/gpt_oss/gpt_oss_causal_lm.py +284 -0
- keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_preprocessor.py +79 -0
- keras_hub/src/models/gpt_oss/gpt_oss_decoder.py +444 -0
- keras_hub/src/models/gpt_oss/gpt_oss_layer_norm.py +34 -0
- keras_hub/src/models/gpt_oss/gpt_oss_presets.py +51 -0
- keras_hub/src/models/gpt_oss/gpt_oss_tokenizer.py +39 -0
- keras_hub/src/models/llama/llama_backbone.py +1 -3
- keras_hub/src/models/llama3/llama3_presets.py +1 -1
- keras_hub/src/models/masked_lm.py +22 -0
- keras_hub/src/models/mistral/mistral_backbone.py +1 -3
- keras_hub/src/models/mixtral/mixtral_backbone.py +1 -3
- keras_hub/src/models/moonshine/moonshine_backbone.py +1 -3
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +1 -3
- keras_hub/src/models/parseq/parseq_decoder.py +21 -9
- keras_hub/src/models/phi3/phi3_backbone.py +1 -3
- keras_hub/src/models/qwen/qwen_backbone.py +1 -3
- keras_hub/src/models/qwen3/qwen3_backbone.py +1 -3
- keras_hub/src/models/qwen3/qwen3_presets.py +36 -0
- keras_hub/src/models/qwen3_moe/qwen3_moe_backbone.py +1 -3
- keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +1 -3
- keras_hub/src/models/roformer_v2/roformer_v2_backbone.py +1 -3
- keras_hub/src/models/siglip/siglip_layers.py +1 -3
- keras_hub/src/models/smollm3/__init__.py +5 -0
- keras_hub/src/models/smollm3/smollm3_backbone.py +1 -3
- keras_hub/src/models/smollm3/smollm3_presets.py +16 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +1 -1
- keras_hub/src/models/stable_diffusion_3/t5_encoder.py +1 -3
- keras_hub/src/models/t5/t5_backbone.py +1 -3
- keras_hub/src/models/t5gemma/t5gemma_backbone.py +1 -3
- keras_hub/src/tests/test_case.py +1 -3
- keras_hub/src/utils/transformers/convert_gemma3.py +353 -0
- keras_hub/src/utils/transformers/convert_gpt_oss.py +302 -0
- keras_hub/src/utils/transformers/preset_loader.py +12 -0
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/METADATA +4 -5
- {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/RECORD +66 -53
- {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"""Gpt-Oss conversion script."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from keras_hub.src.models.gpt_oss.gpt_oss_backbone import GptOssBackbone
|
|
8
|
+
from keras_hub.src.utils.preset_utils import get_file
|
|
9
|
+
|
|
10
|
+
backbone_cls = GptOssBackbone
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def convert_backbone_config(transformers_config):
    """Translate a Hugging Face Gpt-Oss config into KerasHub kwargs.

    Args:
        transformers_config: Dict parsed from a Hugging Face Gpt-Oss
            checkpoint's `config.json`.

    Returns:
        A dict of keyword arguments suitable for `GptOssBackbone`.
    """
    hf = transformers_config
    config = {
        "vocabulary_size": hf["vocab_size"],
        "num_layers": hf["num_hidden_layers"],
        "num_query_heads": hf["num_attention_heads"],
        "hidden_dim": hf["hidden_size"],
        "intermediate_dim": hf["intermediate_size"],
        "num_key_value_heads": hf["num_key_value_heads"],
        "num_experts": hf["num_local_experts"],
        "top_k": hf["num_experts_per_tok"],
        "rope_max_wavelength": hf["rope_theta"],
        "layer_norm_epsilon": hf["rms_norm_eps"],
        "sliding_window": hf.get("sliding_window"),
        "output_router_logits": hf.get("output_router_logits", False),
    }

    # `head_dim` may be missing or explicitly null; only forward real values.
    head_dim = hf.get("head_dim")
    if head_dim is not None:
        config["head_dim"] = head_dim

    # Include rope_scaling for YaRN support; 32.0 is the upstream default
    # factor when a rope_scaling block is present but omits "factor".
    rope_scaling = hf.get("rope_scaling")
    if rope_scaling is not None:
        config["rope_scaling_factor"] = rope_scaling.get("factor", 32.0)

    return config
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def convert_weights(backbone, loader, transformers_config):
    """Port Hugging Face Gpt-Oss weights into a KerasHub `GptOssBackbone`.

    Copies token embeddings, per-layer attention projections, MoE expert
    weights (dequantized from MXFP4), and the final norm. Dense kernels are
    transposed/reshaped from HF's [out, in] layout into Keras layout via
    `hook_fn`. Returns the mutated `backbone`.

    Args:
        backbone: The target `GptOssBackbone` instance.
        loader: Checkpoint loader exposing `port_weight` and `get_tensor`.
        transformers_config: The HF config dict (unused here, kept for the
            converter interface).
    """
    # Embeddings
    loader.port_weight(
        keras_variable=backbone.token_embedding.embeddings,
        hf_weight_key="model.embed_tokens.weight",
    )
    # Untied LM head: HF stores [vocab, hidden]; Keras reverse embeddings
    # expect [hidden, vocab], hence the transpose.
    loader.port_weight(
        keras_variable=backbone.token_embedding.reverse_embeddings,
        hf_weight_key="lm_head.weight",
        hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
    )

    for i in range(backbone.num_layers):
        decoder_layer = backbone.transformer_layers[i]

        # Input layernorm (RMSNorm scale only; no bias in this model).
        loader.port_weight(
            keras_variable=decoder_layer.input_layernorm.scale,
            hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
        )

        # Attention layers
        attention_layer = decoder_layer.self_attention_layer
        # Query: transpose [out, in] -> [in, out], then reshape to the
        # Keras EinsumDense kernel shape (split into heads).
        loader.port_weight(
            keras_variable=attention_layer.query_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
            hook_fn=lambda hf_tensor, shape: np.reshape(
                np.transpose(hf_tensor, axes=(1, 0)), shape
            ),
        )
        # Query bias: flat HF bias reshaped to the per-head Keras shape.
        loader.port_weight(
            keras_variable=attention_layer.query_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.bias",
            hook_fn=lambda hf_tensor, keras_shape: np.reshape(
                hf_tensor, keras_shape
            ),
        )

        # Key (same transpose+reshape treatment as query).
        loader.port_weight(
            keras_variable=attention_layer.key_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
            hook_fn=lambda hf_tensor, shape: np.reshape(
                np.transpose(hf_tensor, axes=(1, 0)), shape
            ),
        )
        # Key bias
        loader.port_weight(
            keras_variable=attention_layer.key_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.bias",
            hook_fn=lambda hf_tensor, keras_shape: np.reshape(
                hf_tensor, keras_shape
            ),
        )

        # Value
        loader.port_weight(
            keras_variable=attention_layer.value_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
            hook_fn=lambda hf_tensor, shape: np.reshape(
                np.transpose(hf_tensor, axes=(1, 0)), shape
            ),
        )
        # Value bias
        loader.port_weight(
            keras_variable=attention_layer.value_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.bias",
            hook_fn=lambda hf_tensor, keras_shape: np.reshape(
                hf_tensor, keras_shape
            ),
        )

        # Output projection
        loader.port_weight(
            keras_variable=attention_layer.output_dense.kernel,
            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
            hook_fn=lambda hf_tensor, shape: np.reshape(
                np.transpose(hf_tensor, axes=(1, 0)), shape
            ),
        )
        # Output bias
        loader.port_weight(
            keras_variable=attention_layer.output_dense.bias,
            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.bias",
            hook_fn=lambda hf_tensor, keras_shape: np.reshape(
                hf_tensor, keras_shape
            ),
        )

        # Sink tokens (attention-sink logits, one per head — copied as-is).
        loader.port_weight(
            keras_variable=attention_layer.sinks,
            hf_weight_key=f"model.layers.{i}.self_attn.sinks",
        )

        # MoE layers
        moe_block = decoder_layer.sparse_moe_block
        # Router gate: dense [out, in] -> [in, out] transpose, plus bias.
        loader.port_weight(
            keras_variable=moe_block.router.router_dense.kernel,
            hf_weight_key=f"model.layers.{i}.mlp.router.weight",
            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
        )
        loader.port_weight(
            keras_variable=moe_block.router.router_dense.bias,
            hf_weight_key=f"model.layers.{i}.mlp.router.bias",
        )
        # The HF model uses MXFP4 quantization with _blocks and _scales
        # tensor pairs per expert projection; biases are stored unquantized.
        # Get quantized weights and scales
        gate_up_blocks = loader.get_tensor(
            f"model.layers.{i}.mlp.experts.gate_up_proj_blocks"
        )
        gate_up_scales = loader.get_tensor(
            f"model.layers.{i}.mlp.experts.gate_up_proj_scales"
        )
        gate_up_bias = loader.get_tensor(
            f"model.layers.{i}.mlp.experts.gate_up_proj_bias"
        )

        down_blocks = loader.get_tensor(
            f"model.layers.{i}.mlp.experts.down_proj_blocks"
        )
        down_scales = loader.get_tensor(
            f"model.layers.{i}.mlp.experts.down_proj_scales"
        )
        down_bias = loader.get_tensor(
            f"model.layers.{i}.mlp.experts.down_proj_bias"
        )

        # Proper MXFP4 dequantization implementation
        def decode_e8m0(scales_8bit: np.ndarray) -> np.ndarray:
            """Decode 8-bit E8M0 floats (power-of-two scale factors)."""
            # E8M0 is exponent-only with bias 127: value = 2^(raw - 127).
            bias = 127.0
            values = 2.0 ** (scales_8bit.astype(np.float32) - bias)
            return values

        def dequantize_mxfp4(blocks, scales):
            """Dequantize MXFP4 weights (E2M1 4bit, packed in uint8).

            `blocks` has rank 4: [num_experts, out_dim, num_blocks,
            block_size] uint8 values, each packing two 4-bit E2M1 codes.
            Returns float32 of shape
            [num_experts, out_dim, num_blocks * block_size * 2].
            """

            # Decode scales first
            scales = decode_e8m0(scales)
            num_experts, out_dim, num_blocks, block_size = blocks.shape

            # Unpack 4bit values from uint8
            blocks_uint8 = blocks.astype(np.uint8)
            high_nibble = (blocks_uint8 >> 4) & 0xF
            low_nibble = blocks_uint8 & 0xF
            # Stack along new last axis; low nibble first matches the HF
            # packing order.
            blocks_4bit = np.stack([low_nibble, high_nibble], axis=-1)
            # Reshape to [num_experts, out_dim, num_blocks, block_size * 2]
            blocks_4bit = blocks_4bit.reshape(
                num_experts, out_dim, num_blocks, block_size * 2
            )

            # Decode E2M1 4bit values: 1 sign bit, 2 exponent bits,
            # 1 mantissa bit.
            s = (blocks_4bit >> 3) & 0x1
            e = (blocks_4bit >> 1) & 0x3
            m = blocks_4bit & 0x1

            bias = 1.0
            sign = 1.0 - 2.0 * s

            # e == 0 encodes subnormals (value = sign * m/2 * 2^(1-bias)).
            normal_mask = e != 0

            values = np.empty_like(blocks_4bit, dtype=np.float32)

            values[normal_mask] = (
                sign[normal_mask]
                * (2.0 ** (e[normal_mask].astype(np.float32) - bias))
                * (1.0 + m[normal_mask].astype(np.float32) / 2.0)
            )
            values[~normal_mask] = (
                sign[~normal_mask]
                * (2.0 ** (1.0 - bias))
                * (m[~normal_mask].astype(np.float32) / 2.0)
            )

            values = values.reshape(
                num_experts, out_dim, num_blocks * block_size * 2
            )
            # Expand scales to match values shape: one scale per block of
            # block_size * 2 decoded elements.
            scales_expanded = np.repeat(
                scales[..., np.newaxis], block_size * 2, axis=3
            )
            scales_expanded = scales_expanded.reshape(
                num_experts, out_dim, num_blocks * block_size * 2
            )
            dequantized = values * scales_expanded

            return dequantized

        # Dequantize gate_up_proj weights.
        # NOTE(review): the shapes in the original comments ([32, 90, 16,
        # 16] etc.) look checkpoint-specific — confirm against an actual
        # gpt-oss checkpoint.
        gate_up_dequantized = dequantize_mxfp4(gate_up_blocks, gate_up_scales)

        # Transpose [num_experts, out_dim, in_dim] -> [num_experts, in_dim,
        # out_dim] for the Keras expert kernel layout.
        gate_up_proj = np.transpose(gate_up_dequantized, (0, 2, 1))

        # Dequantize down_proj weights (same treatment).
        down_dequantized = dequantize_mxfp4(down_blocks, down_scales)

        down_proj = np.transpose(down_dequantized, (0, 2, 1))

        moe_block.experts.gate_up_proj.assign(gate_up_proj)
        moe_block.experts.down_proj.assign(down_proj)

        # Load biases - reshape to match KerasHub format
        moe_block.experts.gate_up_proj_bias.assign(gate_up_bias)
        moe_block.experts.down_proj_bias.assign(down_bias)

        # Post-attention layernorm
        loader.port_weight(
            keras_variable=decoder_layer.post_attention_layernorm.scale,
            hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
        )

    # Final normalization layer
    loader.port_weight(
        keras_variable=backbone.layer_norm.scale,
        hf_weight_key="model.norm.weight",
    )
    return backbone
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def convert_tokenizer(cls, preset, **kwargs):
    """Convert a Hugging Face tokenizer to a KerasHub tokenizer.

    Reads the preset's `tokenizer.json`, extracts the BPE vocabulary and
    merge rules (plus any `added_tokens`), and constructs `cls` from them.

    Args:
        cls: The KerasHub tokenizer class to instantiate.
        preset: The Hugging Face preset identifier/path passed to
            `get_file`.
        **kwargs: Extra keyword arguments forwarded to `cls`.

    Returns:
        An instance of `cls` built from the converted vocabulary and
        merges.
    """
    tokenizer_file = get_file(preset, "tokenizer.json")
    # tokenizer.json is always UTF-8 encoded; be explicit so the platform
    # default encoding (e.g. cp1252 on Windows) cannot corrupt multibyte
    # tokens in the vocabulary.
    with open(tokenizer_file, "r", encoding="utf-8") as f:
        tokenizer_data = json.load(f)
    vocabulary = tokenizer_data.get("model", {}).get("vocab", {})
    merges = tokenizer_data.get("model", {}).get("merges", [])
    added_tokens = tokenizer_data.get("added_tokens", [])

    # Base vocabulary, ids normalized to int.
    vocab_dict = {token: int(token_id) for token, token_id in vocabulary.items()}

    # Added/special tokens override or extend the base vocabulary.
    for token_info in added_tokens:
        token = token_info.get("content", "")
        token_id = token_info.get("id", 0)
        vocab_dict[token] = int(token_id)

    # Merges appear either as ["a", "b"] pairs (newer tokenizers) or as
    # pre-joined "a b" strings; normalize both to the string form.
    merges_strings = []
    for merge in merges:
        if isinstance(merge, list) and len(merge) == 2:
            merges_strings.append(f"{merge[0]} {merge[1]}")
        else:
            merges_strings.append(str(merge))

    return cls(vocabulary=vocab_dict, merges=merges_strings, **kwargs)
|
|
@@ -12,7 +12,9 @@ from keras_hub.src.utils.transformers import convert_dinov3
|
|
|
12
12
|
from keras_hub.src.utils.transformers import convert_distilbert
|
|
13
13
|
from keras_hub.src.utils.transformers import convert_esm
|
|
14
14
|
from keras_hub.src.utils.transformers import convert_gemma
|
|
15
|
+
from keras_hub.src.utils.transformers import convert_gemma3
|
|
15
16
|
from keras_hub.src.utils.transformers import convert_gpt2
|
|
17
|
+
from keras_hub.src.utils.transformers import convert_gpt_oss
|
|
16
18
|
from keras_hub.src.utils.transformers import convert_llama3
|
|
17
19
|
from keras_hub.src.utils.transformers import convert_mistral
|
|
18
20
|
from keras_hub.src.utils.transformers import convert_mixtral
|
|
@@ -49,8 +51,12 @@ class TransformersPresetLoader(PresetLoader):
|
|
|
49
51
|
self.converter = convert_esm
|
|
50
52
|
elif model_type in ("gemma", "gemma2"):
|
|
51
53
|
self.converter = convert_gemma
|
|
54
|
+
elif model_type in ("gemma3", "gemma3_text"):
|
|
55
|
+
self.converter = convert_gemma3
|
|
52
56
|
elif model_type == "gpt2":
|
|
53
57
|
self.converter = convert_gpt2
|
|
58
|
+
elif model_type == "gpt_oss":
|
|
59
|
+
self.converter = convert_gpt_oss
|
|
54
60
|
elif model_type == "llama":
|
|
55
61
|
# TODO: handle other llama versions.
|
|
56
62
|
self.converter = convert_llama3
|
|
@@ -115,5 +121,11 @@ class TransformersPresetLoader(PresetLoader):
|
|
|
115
121
|
return self.converter.convert_tokenizer(cls, self.preset, **kwargs)
|
|
116
122
|
|
|
117
123
|
def load_image_converter(self, cls, **kwargs):
|
|
124
|
+
if hasattr(self.converter, "load_image_converter_config"):
|
|
125
|
+
config = self.converter.load_image_converter_config(
|
|
126
|
+
self.preset, self.config
|
|
127
|
+
)
|
|
128
|
+
if config is not None:
|
|
129
|
+
return cls(**{**config, **kwargs})
|
|
118
130
|
# TODO: set image size for pali gemma checkpoints.
|
|
119
131
|
return None
|
keras_hub/src/version.py
CHANGED
keras_hub/tokenizers/__init__.py
CHANGED
|
@@ -47,6 +47,9 @@ from keras_hub.src.models.gpt2.gpt2_tokenizer import (
|
|
|
47
47
|
from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import (
|
|
48
48
|
GPTNeoXTokenizer as GPTNeoXTokenizer,
|
|
49
49
|
)
|
|
50
|
+
from keras_hub.src.models.gpt_oss.gpt_oss_tokenizer import (
|
|
51
|
+
GptOssTokenizer as GptOssTokenizer,
|
|
52
|
+
)
|
|
50
53
|
from keras_hub.src.models.llama.llama_tokenizer import (
|
|
51
54
|
LlamaTokenizer as LlamaTokenizer,
|
|
52
55
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: keras-hub-nightly
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.26.0.dev202601010440
|
|
4
4
|
Summary: Pretrained models for Keras.
|
|
5
5
|
Author-email: Keras team <keras-users@googlegroups.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -8,7 +8,6 @@ Project-URL: Home, https://keras.io/keras_hub/
|
|
|
8
8
|
Project-URL: Repository, https://github.com/keras-team/keras/keras_hub
|
|
9
9
|
Classifier: Development Status :: 3 - Alpha
|
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
12
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
14
13
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
@@ -18,9 +17,9 @@ Classifier: Operating System :: MacOS
|
|
|
18
17
|
Classifier: Intended Audience :: Science/Research
|
|
19
18
|
Classifier: Topic :: Scientific/Engineering
|
|
20
19
|
Classifier: Topic :: Software Development
|
|
21
|
-
Requires-Python: >=3.
|
|
20
|
+
Requires-Python: >=3.11
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
|
-
Requires-Dist: keras>=3.
|
|
22
|
+
Requires-Dist: keras>=3.13
|
|
24
23
|
Requires-Dist: absl-py
|
|
25
24
|
Requires-Dist: numpy
|
|
26
25
|
Requires-Dist: packaging
|
|
@@ -31,7 +30,7 @@ Requires-Dist: tensorflow-text; platform_system != "Windows"
|
|
|
31
30
|
|
|
32
31
|
# KerasHub: Multi-framework Pretrained Models
|
|
33
32
|
[](https://github.com/keras-team/keras-hub/actions?query=workflow%3ATests+branch%3Amaster)
|
|
34
|
-

|
|
35
34
|
[](https://github.com/keras-team/keras-hub/issues)
|
|
36
35
|
|
|
37
36
|
> [!IMPORTANT]
|