keras-hub-nightly 0.24.0.dev202511220420__py3-none-any.whl → 0.26.0.dev202601010440__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.

Potentially problematic release: this version of keras-hub-nightly might be problematic.

Files changed (66)
  1. keras_hub/models/__init__.py +12 -0
  2. keras_hub/src/layers/modeling/reversible_embedding.py +2 -275
  3. keras_hub/src/layers/modeling/rotary_embedding.py +188 -14
  4. keras_hub/src/layers/modeling/token_and_position_embedding.py +1 -3
  5. keras_hub/src/models/albert/albert_backbone.py +1 -3
  6. keras_hub/src/models/bart/bart_backbone.py +1 -3
  7. keras_hub/src/models/bert/bert_backbone.py +1 -3
  8. keras_hub/src/models/bloom/bloom_backbone.py +1 -3
  9. keras_hub/src/models/causal_lm.py +23 -1
  10. keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -3
  11. keras_hub/src/models/dinov3/dinov3_presets.py +90 -1
  12. keras_hub/src/models/electra/electra_backbone.py +1 -3
  13. keras_hub/src/models/esm/esm_attention.py +11 -4
  14. keras_hub/src/models/f_net/f_net_backbone.py +1 -3
  15. keras_hub/src/models/falcon/falcon_backbone.py +1 -3
  16. keras_hub/src/models/gemma/gemma_backbone.py +1 -3
  17. keras_hub/src/models/gemma/gemma_causal_lm.py +16 -0
  18. keras_hub/src/models/gemma3/gemma3_backbone.py +1 -3
  19. keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +8 -3
  20. keras_hub/src/models/gemma3/gemma3_presets.py +12 -0
  21. keras_hub/src/models/gemma3/gemma3_tokenizer.py +20 -8
  22. keras_hub/src/models/gpt2/gpt2_backbone.py +1 -3
  23. keras_hub/src/models/gpt2/gpt2_causal_lm.py +17 -0
  24. keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +1 -3
  25. keras_hub/src/models/gpt_oss/__init__.py +5 -0
  26. keras_hub/src/models/gpt_oss/gpt_oss_attention.py +330 -0
  27. keras_hub/src/models/gpt_oss/gpt_oss_backbone.py +219 -0
  28. keras_hub/src/models/gpt_oss/gpt_oss_causal_lm.py +284 -0
  29. keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_preprocessor.py +79 -0
  30. keras_hub/src/models/gpt_oss/gpt_oss_decoder.py +444 -0
  31. keras_hub/src/models/gpt_oss/gpt_oss_layer_norm.py +34 -0
  32. keras_hub/src/models/gpt_oss/gpt_oss_presets.py +51 -0
  33. keras_hub/src/models/gpt_oss/gpt_oss_tokenizer.py +39 -0
  34. keras_hub/src/models/llama/llama_backbone.py +1 -3
  35. keras_hub/src/models/llama3/llama3_presets.py +1 -1
  36. keras_hub/src/models/masked_lm.py +22 -0
  37. keras_hub/src/models/mistral/mistral_backbone.py +1 -3
  38. keras_hub/src/models/mixtral/mixtral_backbone.py +1 -3
  39. keras_hub/src/models/moonshine/moonshine_backbone.py +1 -3
  40. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +1 -3
  41. keras_hub/src/models/parseq/parseq_decoder.py +21 -9
  42. keras_hub/src/models/phi3/phi3_backbone.py +1 -3
  43. keras_hub/src/models/qwen/qwen_backbone.py +1 -3
  44. keras_hub/src/models/qwen3/qwen3_backbone.py +1 -3
  45. keras_hub/src/models/qwen3/qwen3_presets.py +36 -0
  46. keras_hub/src/models/qwen3_moe/qwen3_moe_backbone.py +1 -3
  47. keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +1 -3
  48. keras_hub/src/models/roformer_v2/roformer_v2_backbone.py +1 -3
  49. keras_hub/src/models/siglip/siglip_layers.py +1 -3
  50. keras_hub/src/models/smollm3/__init__.py +5 -0
  51. keras_hub/src/models/smollm3/smollm3_backbone.py +1 -3
  52. keras_hub/src/models/smollm3/smollm3_presets.py +16 -0
  53. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +1 -1
  54. keras_hub/src/models/stable_diffusion_3/t5_encoder.py +1 -3
  55. keras_hub/src/models/t5/t5_backbone.py +1 -3
  56. keras_hub/src/models/t5gemma/t5gemma_backbone.py +1 -3
  57. keras_hub/src/tests/test_case.py +1 -3
  58. keras_hub/src/utils/transformers/convert_gemma3.py +353 -0
  59. keras_hub/src/utils/transformers/convert_gpt_oss.py +302 -0
  60. keras_hub/src/utils/transformers/preset_loader.py +12 -0
  61. keras_hub/src/version.py +1 -1
  62. keras_hub/tokenizers/__init__.py +3 -0
  63. {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/METADATA +4 -5
  64. {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/RECORD +66 -53
  65. {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/WHEEL +0 -0
  66. {keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/top_level.txt +0 -0
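
The headline change is a new gpt_oss model family (backbone, causal LM, preprocessor, decoder, tokenizer, and presets) plus Hugging Face checkpoint converters for GPT-OSS and Gemma3, alongside new presets for DINOv3, Gemma3, Qwen3, and SmolLM3. As a rough orientation before the diffs below, this is how the new conversion path would typically be exercised; a minimal sketch assuming the standard KerasHub from_preset API, with an illustrative checkpoint handle that is not taken from this diff:

import keras_hub

# Any Hugging Face repo whose config.json declares model_type "gpt_oss"
# is routed through the new convert_gpt_oss module (handle illustrative).
causal_lm = keras_hub.models.CausalLM.from_preset("hf://openai/gpt-oss-20b")
causal_lm.generate("The Keras ecosystem is", max_length=64)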
keras_hub/src/utils/transformers/convert_gpt_oss.py ADDED
@@ -0,0 +1,302 @@
+ """Gpt-Oss conversion script."""
+
+ import json
+
+ import numpy as np
+
+ from keras_hub.src.models.gpt_oss.gpt_oss_backbone import GptOssBackbone
+ from keras_hub.src.utils.preset_utils import get_file
+
+ backbone_cls = GptOssBackbone
+
+
+ def convert_backbone_config(transformers_config):
+     """Convert a Hugging Face Gpt-Oss config to a KerasHub config."""
+     config = {
+         "vocabulary_size": transformers_config["vocab_size"],
+         "num_layers": transformers_config["num_hidden_layers"],
+         "num_query_heads": transformers_config["num_attention_heads"],
+         "hidden_dim": transformers_config["hidden_size"],
+         "intermediate_dim": transformers_config["intermediate_size"],
+         "num_key_value_heads": transformers_config["num_key_value_heads"],
+         "num_experts": transformers_config["num_local_experts"],
+         "top_k": transformers_config["num_experts_per_tok"],
+         "rope_max_wavelength": transformers_config["rope_theta"],
+         "layer_norm_epsilon": transformers_config["rms_norm_eps"],
+         "sliding_window": transformers_config.get("sliding_window"),
+         "output_router_logits": transformers_config.get(
+             "output_router_logits", False
+         ),
+     }
+
+     if (
+         "head_dim" in transformers_config
+         and transformers_config["head_dim"] is not None
+     ):
+         config["head_dim"] = transformers_config["head_dim"]
+
+     # Include rope_scaling for YaRN support.
+     if (
+         "rope_scaling" in transformers_config
+         and transformers_config["rope_scaling"] is not None
+     ):
+         config["rope_scaling_factor"] = transformers_config["rope_scaling"].get(
+             "factor", 32.0
+         )
+
+     return config
+
+
+ def convert_weights(backbone, loader, transformers_config):
+     """Convert Gpt-Oss weights."""
+     # Embeddings
+     loader.port_weight(
+         keras_variable=backbone.token_embedding.embeddings,
+         hf_weight_key="model.embed_tokens.weight",
+     )
+     loader.port_weight(
+         keras_variable=backbone.token_embedding.reverse_embeddings,
+         hf_weight_key="lm_head.weight",
+         hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+     )
+
+     for i in range(backbone.num_layers):
+         decoder_layer = backbone.transformer_layers[i]
+
+         # Input layernorm
+         loader.port_weight(
+             keras_variable=decoder_layer.input_layernorm.scale,
+             hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
+         )
+
+         # Attention layers
+         attention_layer = decoder_layer.self_attention_layer
+         # Query
+         loader.port_weight(
+             keras_variable=attention_layer.query_dense.kernel,
+             hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
+             hook_fn=lambda hf_tensor, shape: np.reshape(
+                 np.transpose(hf_tensor, axes=(1, 0)), shape
+             ),
+         )
+         # Query bias
+         loader.port_weight(
+             keras_variable=attention_layer.query_dense.bias,
+             hf_weight_key=f"model.layers.{i}.self_attn.q_proj.bias",
+             hook_fn=lambda hf_tensor, keras_shape: np.reshape(
+                 hf_tensor, keras_shape
+             ),
+         )
+
+         # Key
+         loader.port_weight(
+             keras_variable=attention_layer.key_dense.kernel,
+             hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
+             hook_fn=lambda hf_tensor, shape: np.reshape(
+                 np.transpose(hf_tensor, axes=(1, 0)), shape
+             ),
+         )
+         # Key bias
+         loader.port_weight(
+             keras_variable=attention_layer.key_dense.bias,
+             hf_weight_key=f"model.layers.{i}.self_attn.k_proj.bias",
+             hook_fn=lambda hf_tensor, keras_shape: np.reshape(
+                 hf_tensor, keras_shape
+             ),
+         )
+
+         # Value
+         loader.port_weight(
+             keras_variable=attention_layer.value_dense.kernel,
+             hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
+             hook_fn=lambda hf_tensor, shape: np.reshape(
+                 np.transpose(hf_tensor, axes=(1, 0)), shape
+             ),
+         )
+         # Value bias
+         loader.port_weight(
+             keras_variable=attention_layer.value_dense.bias,
+             hf_weight_key=f"model.layers.{i}.self_attn.v_proj.bias",
+             hook_fn=lambda hf_tensor, keras_shape: np.reshape(
+                 hf_tensor, keras_shape
+             ),
+         )
+
+         # Output
+         loader.port_weight(
+             keras_variable=attention_layer.output_dense.kernel,
+             hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
+             hook_fn=lambda hf_tensor, shape: np.reshape(
+                 np.transpose(hf_tensor, axes=(1, 0)), shape
+             ),
+         )
+         # Output bias
+         loader.port_weight(
+             keras_variable=attention_layer.output_dense.bias,
+             hf_weight_key=f"model.layers.{i}.self_attn.o_proj.bias",
+             hook_fn=lambda hf_tensor, keras_shape: np.reshape(
+                 hf_tensor, keras_shape
+             ),
+         )
+
+         # Sink tokens
+         loader.port_weight(
+             keras_variable=attention_layer.sinks,
+             hf_weight_key=f"model.layers.{i}.self_attn.sinks",
+         )
+
+         # MoE layers
+         moe_block = decoder_layer.sparse_moe_block
+         # Router gate
+         loader.port_weight(
+             keras_variable=moe_block.router.router_dense.kernel,
+             hf_weight_key=f"model.layers.{i}.mlp.router.weight",
+             hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+         )
+         loader.port_weight(
+             keras_variable=moe_block.router.router_dense.bias,
+             hf_weight_key=f"model.layers.{i}.mlp.router.bias",
+         )
+         # The HF model uses MXFP4 quantization with _blocks and _scales.
+         # Get quantized weights and scales.
+         gate_up_blocks = loader.get_tensor(
+             f"model.layers.{i}.mlp.experts.gate_up_proj_blocks"
+         )
+         gate_up_scales = loader.get_tensor(
+             f"model.layers.{i}.mlp.experts.gate_up_proj_scales"
+         )
+         gate_up_bias = loader.get_tensor(
+             f"model.layers.{i}.mlp.experts.gate_up_proj_bias"
+         )
+
+         down_blocks = loader.get_tensor(
+             f"model.layers.{i}.mlp.experts.down_proj_blocks"
+         )
+         down_scales = loader.get_tensor(
+             f"model.layers.{i}.mlp.experts.down_proj_scales"
+         )
+         down_bias = loader.get_tensor(
+             f"model.layers.{i}.mlp.experts.down_proj_bias"
+         )
+
+         # Proper MXFP4 dequantization implementation.
+         def decode_e8m0(scales_8bit: np.ndarray) -> np.ndarray:
+             """Decode 8-bit E8M0 floats (power-of-two scale factors)."""
+             bias = 127.0
+             values = 2.0 ** (scales_8bit.astype(np.float32) - bias)
+             return values
+
+         def dequantize_mxfp4(blocks, scales):
+             """Dequantize MXFP4 weights (E2M1 4bit, packed in uint8)."""
+             # Decode scales first.
+             scales = decode_e8m0(scales)
+             num_experts, out_dim, num_blocks, block_size = blocks.shape
+
+             # Unpack 4bit values from uint8.
+             blocks_uint8 = blocks.astype(np.uint8)
+             high_nibble = (blocks_uint8 >> 4) & 0xF
+             low_nibble = blocks_uint8 & 0xF
+             # Stack along a new last axis, low nibble first.
+             blocks_4bit = np.stack([low_nibble, high_nibble], axis=-1)
+             # Reshape to [num_experts, out_dim, num_blocks, block_size * 2].
+             blocks_4bit = blocks_4bit.reshape(
+                 num_experts, out_dim, num_blocks, block_size * 2
+             )
+
+             # Decode E2M1 4bit values.
+             s = (blocks_4bit >> 3) & 0x1
+             e = (blocks_4bit >> 1) & 0x3
+             m = blocks_4bit & 0x1
+
+             bias = 1.0
+             sign = 1.0 - 2.0 * s
+
+             normal_mask = e != 0
+
+             values = np.empty_like(blocks_4bit, dtype=np.float32)
+
+             values[normal_mask] = (
+                 sign[normal_mask]
+                 * (2.0 ** (e[normal_mask].astype(np.float32) - bias))
+                 * (1.0 + m[normal_mask].astype(np.float32) / 2.0)
+             )
+             values[~normal_mask] = (
+                 sign[~normal_mask]
+                 * (2.0 ** (1.0 - bias))
+                 * (m[~normal_mask].astype(np.float32) / 2.0)
+             )
+
+             values = values.reshape(
+                 num_experts, out_dim, num_blocks * block_size * 2
+             )
+             # Expand scales to match values shape.
+             scales_expanded = np.repeat(
+                 scales[..., np.newaxis], block_size * 2, axis=3
+             )
+             scales_expanded = scales_expanded.reshape(
+                 num_experts, out_dim, num_blocks * block_size * 2
+             )
+             dequantized = values * scales_expanded
+
+             return dequantized
+
+         # Dequantize gate_up_proj weights, then move the feature axis:
+         # [num_experts, out_dim, features] -> [num_experts, features, out_dim].
+         gate_up_dequantized = dequantize_mxfp4(gate_up_blocks, gate_up_scales)
+         gate_up_proj = np.transpose(gate_up_dequantized, (0, 2, 1))
+
+         # Dequantize down_proj weights the same way.
+         down_dequantized = dequantize_mxfp4(down_blocks, down_scales)
+         down_proj = np.transpose(down_dequantized, (0, 2, 1))
+
+         moe_block.experts.gate_up_proj.assign(gate_up_proj)
+         moe_block.experts.down_proj.assign(down_proj)
+
+         # Load biases; these already match the KerasHub format.
+         moe_block.experts.gate_up_proj_bias.assign(gate_up_bias)
+         moe_block.experts.down_proj_bias.assign(down_bias)
+
+         # Post-attention layernorm
+         loader.port_weight(
+             keras_variable=decoder_layer.post_attention_layernorm.scale,
+             hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
+         )
+
+     # Final normalization layer
+     loader.port_weight(
+         keras_variable=backbone.layer_norm.scale,
+         hf_weight_key="model.norm.weight",
+     )
+     return backbone
+
+
+ def convert_tokenizer(cls, preset, **kwargs):
+     """Convert a Hugging Face tokenizer to a KerasHub tokenizer."""
+
+     tokenizer_file = get_file(preset, "tokenizer.json")
+     with open(tokenizer_file, "r") as f:
+         tokenizer_data = json.load(f)
+     vocabulary = tokenizer_data.get("model", {}).get("vocab", {})
+     merges = tokenizer_data.get("model", {}).get("merges", [])
+     added_tokens = tokenizer_data.get("added_tokens", [])
+
+     vocab_dict = {}
+     for token, token_id in vocabulary.items():
+         vocab_dict[token] = int(token_id)
+
+     for token_info in added_tokens:
+         token = token_info.get("content", "")
+         token_id = token_info.get("id", 0)
+         vocab_dict[token] = int(token_id)
+
+     merges_strings = []
+     for merge in merges:
+         if isinstance(merge, list) and len(merge) == 2:
+             merges_strings.append(f"{merge[0]} {merge[1]}")
+         else:
+             merges_strings.append(str(merge))
+
+     return cls(vocabulary=vocab_dict, merges=merges_strings, **kwargs)
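
The dequantization above decodes two formats: E2M1 4-bit values (two per packed byte) and E8M0 block scales (one power-of-two factor per 32-value block). A standalone sanity check of that decoding, a NumPy-only sketch that is not part of the released file:

import numpy as np

# All 16 E2M1 nibbles as decoded above: sign in bit 3, exponent in
# bits 1-2 (bias 1), mantissa in bit 0.
e2m1 = np.array(
    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
     -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0],
    dtype=np.float32,
)

# A packed byte holds two nibbles, low nibble first (matching the
# np.stack([low_nibble, high_nibble], axis=-1) ordering above).
packed = np.uint8((0b1010 << 4) | 0b0111)  # high -> -1.0, low -> 6.0
assert e2m1[packed & 0xF] == 6.0
assert e2m1[(packed >> 4) & 0xF] == -1.0

# An E8M0 scale byte is a bare biased exponent: 2.0 ** (byte - 127).
scale = 2.0 ** (np.float32(129) - 127.0)  # 4.0
assert e2m1[packed & 0xF] * scale == 24.0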
keras_hub/src/utils/transformers/preset_loader.py CHANGED
@@ -12,7 +12,9 @@ from keras_hub.src.utils.transformers import convert_dinov3
  from keras_hub.src.utils.transformers import convert_distilbert
  from keras_hub.src.utils.transformers import convert_esm
  from keras_hub.src.utils.transformers import convert_gemma
+ from keras_hub.src.utils.transformers import convert_gemma3
  from keras_hub.src.utils.transformers import convert_gpt2
+ from keras_hub.src.utils.transformers import convert_gpt_oss
  from keras_hub.src.utils.transformers import convert_llama3
  from keras_hub.src.utils.transformers import convert_mistral
  from keras_hub.src.utils.transformers import convert_mixtral
@@ -49,8 +51,12 @@ class TransformersPresetLoader(PresetLoader):
              self.converter = convert_esm
          elif model_type in ("gemma", "gemma2"):
              self.converter = convert_gemma
+         elif model_type in ("gemma3", "gemma3_text"):
+             self.converter = convert_gemma3
          elif model_type == "gpt2":
              self.converter = convert_gpt2
+         elif model_type == "gpt_oss":
+             self.converter = convert_gpt_oss
          elif model_type == "llama":
              # TODO: handle other llama versions.
              self.converter = convert_llama3
@@ -115,5 +121,11 @@ class TransformersPresetLoader(PresetLoader):
          return self.converter.convert_tokenizer(cls, self.preset, **kwargs)

      def load_image_converter(self, cls, **kwargs):
+         if hasattr(self.converter, "load_image_converter_config"):
+             config = self.converter.load_image_converter_config(
+                 self.preset, self.config
+             )
+             if config is not None:
+                 return cls(**{**config, **kwargs})
          # TODO: set image size for pali gemma checkpoints.
          return None
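
With the two new branches above, loading a Hugging Face checkpoint dispatches on the model_type field of the checkpoint's config.json: "gemma3" and "gemma3_text" route to convert_gemma3, and "gpt_oss" routes to convert_gpt_oss. A minimal sketch of the Gemma3 route (the repo handle is illustrative, not taken from this diff):

import keras_hub

# model_type "gemma3_text" in config.json selects convert_gemma3.
backbone = keras_hub.models.Backbone.from_preset("hf://google/gemma-3-1b-it")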
keras_hub/src/version.py CHANGED
@@ -1,7 +1,7 @@
  from keras_hub.src.api_export import keras_hub_export

  # Unique source of truth for the version number.
- __version__ = "0.24.0.dev202511220420"
+ __version__ = "0.26.0.dev202601010440"


  @keras_hub_export("keras_hub.version")
keras_hub/tokenizers/__init__.py CHANGED
@@ -47,6 +47,9 @@ from keras_hub.src.models.gpt2.gpt2_tokenizer import (
  from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import (
      GPTNeoXTokenizer as GPTNeoXTokenizer,
  )
+ from keras_hub.src.models.gpt_oss.gpt_oss_tokenizer import (
+     GptOssTokenizer as GptOssTokenizer,
+ )
  from keras_hub.src.models.llama.llama_tokenizer import (
      LlamaTokenizer as LlamaTokenizer,
  )
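
The newly exported GptOssTokenizer pairs with the convert_tokenizer routine in convert_gpt_oss.py above, which parses the Hugging Face tokenizer.json layout. A toy illustration of the fields that routine consumes, with a made-up vocabulary rather than the real GPT-OSS one:

# Structure assumed by convert_tokenizer (values are illustrative).
tokenizer_json = {
    "model": {
        "vocab": {"hell": 0, "o": 1, "hello": 2},
        # Newer exports store merges as pairs, older ones as "hell o"
        # strings; the converter normalizes both to "hell o".
        "merges": [["hell", "o"]],
    },
    # Entries here are folded into the vocabulary dict by id.
    "added_tokens": [{"id": 3, "content": "<|endoftext|>"}],
}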
{keras_hub_nightly-0.24.0.dev202511220420.dist-info → keras_hub_nightly-0.26.0.dev202601010440.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: keras-hub-nightly
- Version: 0.24.0.dev202511220420
+ Version: 0.26.0.dev202601010440
  Summary: Pretrained models for Keras.
  Author-email: Keras team <keras-users@googlegroups.com>
  License-Expression: Apache-2.0
@@ -8,7 +8,6 @@ Project-URL: Home, https://keras.io/keras_hub/
  Project-URL: Repository, https://github.com/keras-team/keras/keras_hub
  Classifier: Development Status :: 3 - Alpha
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -18,9 +17,9 @@ Classifier: Operating System :: MacOS
  Classifier: Intended Audience :: Science/Research
  Classifier: Topic :: Scientific/Engineering
  Classifier: Topic :: Software Development
- Requires-Python: >=3.10
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
- Requires-Dist: keras>=3.8
+ Requires-Dist: keras>=3.13
  Requires-Dist: absl-py
  Requires-Dist: numpy
  Requires-Dist: packaging
@@ -31,7 +30,7 @@ Requires-Dist: tensorflow-text; platform_system != "Windows"

  # KerasHub: Multi-framework Pretrained Models
  [![](https://github.com/keras-team/keras-hub/workflows/Tests/badge.svg?branch=master)](https://github.com/keras-team/keras-hub/actions?query=workflow%3ATests+branch%3Amaster)
- ![Python](https://img.shields.io/badge/python-v3.10.0+-success.svg)
+ ![Python](https://img.shields.io/badge/python-v3.11.0+-success.svg)
  [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/keras-team/keras-hub/issues)

  > [!IMPORTANT]