keras-hub-nightly 0.22.0.dev202505290412__py3-none-any.whl → 0.22.0.dev202505300409__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in the public registry.
keras_hub/models/__init__.py CHANGED
@@ -444,6 +444,15 @@ from keras_hub.src.models.qwen.qwen_tokenizer import (
444
444
  from keras_hub.src.models.qwen.qwen_tokenizer import (
445
445
  QwenTokenizer as QwenTokenizer,
446
446
  )
447
+ from keras_hub.src.models.qwen3.qwen3_backbone import (
448
+ Qwen3Backbone as Qwen3Backbone,
449
+ )
450
+ from keras_hub.src.models.qwen3.qwen3_causal_lm_preprocessor import (
451
+ Qwen3CausalLMPreprocessor as Qwen3CausalLMPreprocessor,
452
+ )
453
+ from keras_hub.src.models.qwen3.qwen3_tokenizer import (
454
+ Qwen3Tokenizer as Qwen3Tokenizer,
455
+ )
447
456
  from keras_hub.src.models.qwen_moe.qwen_moe_backbone import (
448
457
  QwenMoeBackbone as QwenMoeBackbone,
449
458
  )
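
The hunk above re-exports the new Qwen3 classes from the public `keras_hub.models` namespace; note that this release adds the backbone, causal-LM preprocessor, and tokenizer (no Qwen3 task model yet). A hedged usage sketch showing how the three exports fit together, assuming a published Qwen3 preset exists (the preset name below is hypothetical):

```python
import keras_hub

# Hypothetical preset name, used only for illustration.
preset = "qwen3_0.6b_en"

tokenizer = keras_hub.models.Qwen3Tokenizer.from_preset(preset)
preprocessor = keras_hub.models.Qwen3CausalLMPreprocessor.from_preset(preset)
backbone = keras_hub.models.Qwen3Backbone.from_preset(preset)

# The preprocessor turns raw strings into padded token ids, shifted labels,
# and a sample-weight mask; the backbone maps token ids to hidden states.
x, y, sample_weight = preprocessor(["Qwen3 support lands in this nightly."])
hidden_states = backbone(x)  # (batch, sequence_length, hidden_dim)
```
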
keras_hub/src/models/mixtral/mixtral_presets.py CHANGED
@@ -4,8 +4,8 @@ backbone_presets = {
4
4
  "mixtral_8_7b_en": {
5
5
  "metadata": {
6
6
  "description": (
7
- "32-layer Mixtral MoE model with 7 billion",
8
- "active parameters and 8 experts per MoE layer.",
7
+ "32-layer Mixtral MoE model with 7 billion"
8
+ "active parameters and 8 experts per MoE layer."
9
9
  ),
10
10
  "params": 46702792704,
11
11
  "path": "mixtral",
@@ -15,8 +15,8 @@ backbone_presets = {
15
15
  "mixtral_8_instruct_7b_en": {
16
16
  "metadata": {
17
17
  "description": (
18
- "Instruction fine-tuned 32-layer Mixtral MoE model",
19
- "with 7 billion active parameters and 8 experts per MoE layer.",
18
+ "Instruction fine-tuned 32-layer Mixtral MoE model"
19
+ "with 7 billion active parameters and 8 experts per MoE layer."
20
20
  ),
21
21
  "params": 46702792704,
22
22
  "path": "mixtral",
keras_hub/src/models/qwen/qwen_presets.py CHANGED
@@ -28,8 +28,8 @@ backbone_presets = {
28
28
  "qwen2.5_instruct_0.5b_en": {
29
29
  "metadata": {
30
30
  "description": (
31
- "Instruction fine-tuned 24-layer Qwen model with 0.5 ",
32
- "billion parameters.",
31
+ "Instruction fine-tuned 24-layer Qwen model with 0.5 "
32
+ "billion parameters."
33
33
  ),
34
34
  "params": 494032768,
35
35
  "path": "qwen",
@@ -39,8 +39,8 @@ backbone_presets = {
39
39
  "qwen2.5_instruct_32b_en": {
40
40
  "metadata": {
41
41
  "description": (
42
- "Instruction fine-tuned 64-layer Qwen model with 32 ",
43
- "billion parameters.",
42
+ "Instruction fine-tuned 64-layer Qwen model with 32 "
43
+ "billion parameters."
44
44
  ),
45
45
  "params": 32763876352,
46
46
  "path": "qwen",
@@ -50,8 +50,8 @@ backbone_presets = {
50
50
  "qwen2.5_instruct_72b_en": {
51
51
  "metadata": {
52
52
  "description": (
53
- "Instruction fine-tuned 80-layer Qwen model with 72 ",
54
- "billion parameters.",
53
+ "Instruction fine-tuned 80-layer Qwen model with 72 "
54
+ "billion parameters."
55
55
  ),
56
56
  "params": 72706203648,
57
57
  "path": "qwen",
keras_hub/src/models/qwen3/qwen3_attention.py ADDED
@@ -0,0 +1,369 @@
1
+ import math
2
+
3
+ import keras
4
+ from keras import ops
5
+
6
+ from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
7
+ from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm
8
+ from keras_hub.src.utils.keras_utils import clone_initializer
9
+ from keras_hub.src.utils.keras_utils import fused_attention_op_available
10
+
11
+
12
+ class Qwen3Attention(keras.layers.Layer):
13
+ """A multi-head attention layer for Qwen3 models
14
+
15
+ This attention implementation supports grouped-query attention (GQA) where
16
+ the number of key-value heads can be less than the number of query heads.
17
+
18
+ Args:
19
+ num_query_heads: Number of query heads.
20
+ num_key_value_heads: Number of key/value heads (for GQA).
21
+ rope_max_wavelength: Maximum wavelength for RoPE (Rotary Position
22
+ Embedding).
23
+ rope_scaling_factor: Scaling factor for RoPE, used for extending
24
+ context length.
25
+ kernel_initializer: Initializer for the kernel weights.
26
+ dropout: Dropout rate for attention weights.
27
+ sliding_window_size: Size of the sliding window for attention.
28
+ **kwargs: Additional keyword arguments to pass to the Layer.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ num_query_heads,
34
+ num_key_value_heads,
35
+ head_dim,
36
+ rope_max_wavelength=10000,
37
+ rope_scaling_factor=1,
38
+ kernel_initializer="glorot_uniform",
39
+ dropout=0.0,
40
+ layer_norm_epsilon=1e-5,
41
+ sliding_window_size=None,
42
+ **kwargs,
43
+ ):
44
+ super().__init__(
45
+ **kwargs,
46
+ )
47
+ self.num_query_heads = num_query_heads
48
+ self.num_key_value_heads = num_key_value_heads
49
+ self.head_dim = head_dim
50
+ self.dropout = dropout
51
+
52
+ self.layer_norm_epsilon = layer_norm_epsilon
53
+
54
+ self.num_key_value_groups = num_query_heads // num_key_value_heads
55
+ self.rope_max_wavelength = rope_max_wavelength
56
+
57
+ self.kernel_initializer = keras.initializers.get(
58
+ clone_initializer(kernel_initializer)
59
+ )
60
+
61
+ self.rope_scaling_factor = rope_scaling_factor
62
+ self.sliding_window_size = sliding_window_size
63
+
64
+ def build(self, inputs_shape):
65
+ # Einsum variables:
66
+ # b = batch size
67
+ # q = query length
68
+ # k = key/value length
69
+ # m = model dim
70
+ # u = num query heads
71
+ # v = num key/value heads
72
+ # h = head dim
73
+ hidden_dim = inputs_shape[-1]
74
+ if not self.head_dim:
75
+ self.head_dim = hidden_dim // self.num_query_heads
76
+
77
+ self._inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
78
+ self._query_dense = keras.layers.EinsumDense(
79
+ equation="bqm,muh->bquh",
80
+ output_shape=(None, self.num_query_heads, self.head_dim),
81
+ kernel_initializer=self.kernel_initializer,
82
+ dtype=self.dtype_policy,
83
+ name="query",
84
+ )
85
+ self._query_dense.build(inputs_shape)
86
+
87
+ self._query_dense_layer_norm = Qwen3LayerNorm(
88
+ epsilon=self.layer_norm_epsilon,
89
+ dtype=self.dtype_policy,
90
+ head_dim=self.head_dim,
91
+ name="query_dense_layernorm",
92
+ )
93
+ self._query_dense_layer_norm.build(inputs_shape)
94
+
95
+ self._key_dense = keras.layers.EinsumDense(
96
+ equation="bkm,mvh->bkvh",
97
+ output_shape=(
98
+ None,
99
+ self.num_key_value_heads,
100
+ self.head_dim,
101
+ ),
102
+ kernel_initializer=self.kernel_initializer,
103
+ dtype=self.dtype_policy,
104
+ name="key",
105
+ )
106
+ self._key_dense.build(inputs_shape)
107
+
108
+ self._key_dense_layer_norm = Qwen3LayerNorm(
109
+ epsilon=self.layer_norm_epsilon,
110
+ dtype=self.dtype_policy,
111
+ head_dim=self.head_dim,
112
+ name="key_dense_layernorm",
113
+ )
114
+ self._key_dense_layer_norm.build(inputs_shape)
115
+
116
+ self._value_dense = keras.layers.EinsumDense(
117
+ equation="bkm,mvh->bkvh",
118
+ output_shape=(
119
+ None,
120
+ self.num_key_value_heads,
121
+ self.head_dim,
122
+ ),
123
+ kernel_initializer=self.kernel_initializer,
124
+ dtype=self.dtype_policy,
125
+ name="value",
126
+ )
127
+ self._value_dense.build(inputs_shape)
128
+
129
+ self._softmax = keras.layers.Softmax(
130
+ axis=-1,
131
+ dtype="float32",
132
+ name="attention_softmax",
133
+ )
134
+
135
+ self._dropout_layer = keras.layers.Dropout(
136
+ rate=self.dropout,
137
+ dtype=self.dtype_policy,
138
+ )
139
+
140
+ self._output_dense = keras.layers.EinsumDense(
141
+ equation="bquh,uhm->bqm",
142
+ output_shape=(None, hidden_dim),
143
+ kernel_initializer=self.kernel_initializer,
144
+ dtype=self.dtype_policy,
145
+ name="attention_output",
146
+ )
147
+ self._output_dense.build(
148
+ (None, None, self.num_query_heads, self.head_dim)
149
+ )
150
+
151
+ self.rotary_embedding_layer = RotaryEmbedding(
152
+ max_wavelength=self.rope_max_wavelength,
153
+ scaling_factor=self.rope_scaling_factor,
154
+ dtype=self.dtype_policy,
155
+ )
156
+
157
+ self._dot_product_equation = "bquh,bkuh->buqk"
158
+ self._combine_equation = "buqk,bkuh->bquh"
159
+
160
+ self.built = True
161
+
162
+ def call(
163
+ self,
164
+ hidden_states,
165
+ attention_mask=None,
166
+ cache=None,
167
+ cache_update_index=None,
168
+ training=None,
169
+ ):
170
+ """Applies attention mechanism to the input hidden states.
171
+
172
+ Args:
173
+ hidden_states: Input tensor of shape [batch_size, seq_length,
174
+ hidden_size].
175
+ attention_mask: Mask tensor of shape [batch_size, seq_length,
176
+ seq_length].
177
+ cache: Optional cached key and value tensors.
178
+ cache_update_index: Index at which to update the cache.
179
+ training: Boolean indicating whether in training mode.
180
+
181
+ Returns:
182
+ attention_output: Output tensor after applying attention.
183
+ cache: Updated cache tensors (if cache is provided).
184
+ """
185
+ start_index = (
186
+ cache_update_index if cache_update_index is not None else 0
187
+ )
188
+
189
+ query = self._query_dense(hidden_states)
190
+ query = self._query_dense_layer_norm(query)
191
+
192
+ # Compute RoPE for queries
193
+ query = self.rotary_embedding_layer(query, start_index=start_index)
194
+
195
+ def _compute_key_value(x):
196
+ key = self._key_dense(x)
197
+ key = self._key_dense_layer_norm(key)
198
+ key = self.rotary_embedding_layer(key, start_index=start_index)
199
+
200
+ value = self._value_dense(x)
201
+
202
+ return key, value
203
+
204
+ if cache is not None:
205
+ key_cache = cache[:, 0, ...]
206
+ value_cache = cache[:, 1, ...]
207
+ if cache_update_index is None:
208
+ key = key_cache
209
+ value = value_cache
210
+ else:
211
+ key_update, value_update = _compute_key_value(hidden_states)
212
+ start = [0, cache_update_index, 0, 0]
213
+ key = ops.slice_update(key_cache, start, key_update)
214
+ value = ops.slice_update(value_cache, start, value_update)
215
+ cache = ops.stack((key, value), axis=1)
216
+ else:
217
+ if cache_update_index is not None:
218
+ raise ValueError(
219
+ "`cache_update_index` should not be set if `cache` is "
220
+ f"`None`. Received: cache={cache}, "
221
+ f"cache_update_index={cache_update_index}"
222
+ )
223
+ key, value = _compute_key_value(hidden_states)
224
+
225
+ # [batch_shape, seq_len, num_key_value_heads, head_dim]
226
+ # -> [batch_shape, seq_len, num_heads, head_dim]
227
+ key = ops.repeat(key, repeats=self.num_key_value_groups, axis=2)
228
+ value = ops.repeat(value, repeats=self.num_key_value_groups, axis=2)
229
+
230
+ attention_output = self._compute_attention(
231
+ query,
232
+ key,
233
+ value,
234
+ attention_mask,
235
+ cache_update_index=cache_update_index,
236
+ )
237
+
238
+ attention_output = self._dropout_layer(
239
+ attention_output, training=training
240
+ )
241
+
242
+ attention_output = self._output_dense(attention_output)
243
+
244
+ if cache is not None:
245
+ return attention_output, cache
246
+ return attention_output
247
+
248
+ def _masked_softmax(self, attention_scores, attention_mask=None):
249
+ """Applies softmax with optional masking.
250
+
251
+ Args:
252
+ attention_scores: Attention score tensor.
253
+ attention_mask: Optional mask tensor.
254
+
255
+ Returns:
256
+ Masked softmax attention weights.
257
+ """
258
+ if attention_mask is not None:
259
+ return self._softmax(
260
+ attention_scores, attention_mask[:, None, :, :]
261
+ )
262
+ return self._softmax(attention_scores)
263
+
264
+ def _compute_attention(
265
+ self, query, key, value, attention_mask=None, cache_update_index=None
266
+ ):
267
+ """Computes attention using query, key, and value tensors.
268
+
269
+ Uses Flash Attention when available for better performance.
270
+
271
+ Args:
272
+ query: Query tensor.
273
+ key: Key tensor.
274
+ value: Value tensor.
275
+ attention_mask: Optional mask tensor.
276
+ cache_update_index: Index for sliding window computation.
277
+
278
+ Returns:
279
+ attention_output: Output tensor after applying attention.
280
+ """
281
+ if fused_attention_op_available():
282
+ # Use `dot_product_attention` with Flash Attention support if
283
+ # available.
284
+ if attention_mask is not None:
285
+ attention_mask = ops.expand_dims(attention_mask, axis=1)
286
+ attention_mask = ops.cast(attention_mask, dtype="bool")
287
+ attention_output = ops.dot_product_attention(
288
+ query,
289
+ key,
290
+ value,
291
+ mask=attention_mask,
292
+ scale=self._inv_norm_factor,
293
+ )
294
+ return attention_output
295
+
296
+ attention_scores = ops.einsum(self._dot_product_equation, query, key)
297
+
298
+ attention_scores = ops.multiply(
299
+ attention_scores,
300
+ ops.cast(self._inv_norm_factor, self.compute_dtype),
301
+ )
302
+ if not self.sliding_window_size:
303
+ attention_mask = self._mask_sliding_window(
304
+ attention_mask,
305
+ cache_update_index=cache_update_index
306
+ if cache_update_index
307
+ else 0,
308
+ )
309
+ attention_scores = self._masked_softmax(
310
+ attention_scores, attention_mask
311
+ )
312
+ attention_scores = ops.cast(attention_scores, self.compute_dtype)
313
+ attention_output = ops.einsum(
314
+ self._combine_equation, attention_scores, value
315
+ )
316
+
317
+ return attention_output
318
+
319
+ def _mask_sliding_window(
320
+ self,
321
+ attention_mask,
322
+ cache_update_index=0,
323
+ ):
324
+ """Creates and combines a sliding window mask with the attention mask.
325
+
326
+ Args:
327
+ attention_mask: Original attention mask.
328
+ cache_update_index: Starting index for the sliding window.
329
+
330
+ Returns:
331
+ Combined attention mask with sliding window constraints.
332
+ """
333
+ _, query_len, key_len = ops.shape(attention_mask)
334
+ # Compute the sliding window for square attention.
335
+ all_ones = ops.ones((key_len, key_len), "bool")
336
+ if keras.config.backend() == "tensorflow":
337
+ # TODO: triu/tril has issues with dynamic shape on the tensorflow
338
+ # backend. We should fix, but use `band_part` for now.
339
+ import tensorflow as tf
340
+
341
+ band_size = ops.minimum(key_len, self.sliding_window_size - 1)
342
+ band_size = ops.cast(band_size, "int32")
343
+ sliding_mask = tf.linalg.band_part(all_ones, band_size, band_size)
344
+ else:
345
+ sliding_mask = ops.triu(
346
+ all_ones, -1 * self.sliding_window_size + 1
347
+ ) * ops.tril(all_ones, self.sliding_window_size - 1)
348
+ # Slice the window for short queries during generation.
349
+ start = (cache_update_index, 0)
350
+ sliding_mask = ops.slice(sliding_mask, start, (query_len, key_len))
351
+ sliding_mask = ops.expand_dims(sliding_mask, 0)
352
+ return ops.logical_and(attention_mask, ops.cast(sliding_mask, "bool"))
353
+
354
+ def get_config(self):
355
+ config = super().get_config()
356
+ config.update(
357
+ {
358
+ "num_query_heads": self.num_query_heads,
359
+ "num_key_value_heads": self.num_key_value_heads,
360
+ "rope_max_wavelength": self.rope_max_wavelength,
361
+ "rope_scaling_factor": self.rope_scaling_factor,
362
+ "kernel_initializer": keras.initializers.serialize(
363
+ self.kernel_initializer
364
+ ),
365
+ "dropout": self.dropout,
366
+ "sliding_window_size": self.sliding_window_size,
367
+ }
368
+ )
369
+ return config
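
For context on the grouped-query attention path in `Qwen3Attention.call` above: keys and values are projected with `num_key_value_heads` heads and then repeated `num_query_heads // num_key_value_heads` times along the head axis before the attention einsum. A shape-only sketch with toy dimensions (illustrative, not the model's real sizes):

```python
from keras import ops

batch, seq_len, head_dim = 2, 6, 4
num_query_heads, num_key_value_heads = 8, 2
num_key_value_groups = num_query_heads // num_key_value_heads  # 4 query heads per kv head

query = ops.ones((batch, seq_len, num_query_heads, head_dim))
key = ops.ones((batch, seq_len, num_key_value_heads, head_dim))

# Each key/value head is shared by `num_key_value_groups` query heads,
# mirroring the `ops.repeat(..., axis=2)` calls in `Qwen3Attention.call`.
key = ops.repeat(key, repeats=num_key_value_groups, axis=2)
print(ops.shape(key))  # (2, 6, 8, 4) -- now aligned with the query heads

# Same dot-product equation as the layer: "bquh,bkuh->buqk".
scores = ops.einsum("bquh,bkuh->buqk", query, key)
print(ops.shape(scores))  # (2, 8, 6, 6): one (query, key) score matrix per query head
```
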
keras_hub/src/models/qwen3/qwen3_backbone.py ADDED
@@ -0,0 +1,191 @@
1
+ import keras
2
+ from keras import ops
3
+
4
+ from keras_hub.src.api_export import keras_hub_export
5
+ from keras_hub.src.layers.modeling.reversible_embedding import (
6
+ ReversibleEmbedding,
7
+ )
8
+ from keras_hub.src.models.backbone import Backbone
9
+ from keras_hub.src.models.qwen3.qwen3_decoder import Qwen3TransformerDecoder
10
+ from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm
11
+
12
+
13
+ def _qwen3_kernel_initializer(stddev=0.02):
14
+ return keras.initializers.RandomNormal(stddev=stddev)
15
+
16
+
17
+ @keras_hub_export("keras_hub.models.Qwen3Backbone")
18
+ class Qwen3Backbone(Backbone):
19
+ """The Qwen3 Transformer core architecture with hyperparameters.
20
+
21
+ This network implements a Transformer-based decoder network,
22
+ Qwen3, as described in the Qwen3 model architecture.
23
+ It includes the embedding lookups and transformer layers.
24
+
25
+ The default constructor gives a fully customizable, randomly initialized
26
+ Qwen3 model with any number of layers, heads, and embedding
27
+ dimensions. To load preset architectures and weights, use the `from_preset`
28
+ constructor.
29
+
30
+ Args:
31
+ vocabulary_size (int): The size of the token vocabulary.
32
+ num_layers (int): The number of transformer layers.
33
+ num_query_heads (int): The number of query attention heads for
34
+ each transformer.
35
+ hidden_dim (int): The size of the transformer encoding and pooling
36
+ layers.
37
+ intermediate_dim (int): The output dimension of the first Dense layer in
38
+ a three-layer feedforward network for each transformer.
39
+ num_key_value_heads (int): The number of key and value attention heads
40
+ for each transformer.
41
+ rope_max_wavelength (int, optional): The maximum angular wavelength of
42
+ the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
43
+ rope_scaling_factor (float, optional): The scaling factor for
44
+ calculation of rotary embedding. Defaults to `1.0`.
45
+ layer_norm_epsilon (float, optional): Epsilon for the layer
46
+ normalization layers in the transformer decoder. Defaults to `1e-6`.
47
+ dropout (float, optional): Dropout rate for attention and hidden layers.
48
+ Defaults to `0`.
49
+ dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
50
+ for model computations and weights. Note that some computations,
51
+ such as softmax and layer normalization, will always be done at
52
+ float32 precision regardless of dtype.
53
+ tie_word_embeddings (bool, optional): Whether to tie input and output
54
+ embeddings. Defaults to `True`.
55
+ sliding_window_size (int, optional): Size of the sliding window for
56
+ attention when enabled. Defaults to `32768`.
57
+
58
+ Examples:
59
+
60
+ ```python
61
+ input_data = {
62
+ "token_ids": np.ones(shape=(1, 12), dtype="int32"),
63
+ "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
64
+ }
65
+
66
+ # Pretrained Qwen3 decoder.
67
+ model = keras_hub.models.Qwen3Backbone.from_preset("qwen32.5_0.5b_en")
68
+ model(input_data)
69
+
70
+ # Randomly initialized Qwen3 decoder with custom config.
71
+ model = keras_hub.models.Qwen3Backbone(
72
+ vocabulary_size=10,
73
+ hidden_dim=512,
74
+ num_layers=2,
75
+ num_query_heads=32,
76
+ num_key_value_heads=8,
77
+ intermediate_dim=1024,
78
+ layer_norm_epsilon=1e-6,
79
+ dtype="float32"
80
+ )
81
+ model(input_data)
82
+ ```
83
+ """
84
+
85
+ def __init__(
86
+ self,
87
+ vocabulary_size,
88
+ num_layers,
89
+ num_query_heads,
90
+ num_key_value_heads,
91
+ head_dim,
92
+ hidden_dim,
93
+ intermediate_dim,
94
+ rope_max_wavelength=10000,
95
+ rope_scaling_factor=1.0,
96
+ layer_norm_epsilon=1e-6,
97
+ dropout=0.0,
98
+ tie_word_embeddings=True,
99
+ sliding_window_size=32768,
100
+ dtype=None,
101
+ **kwargs,
102
+ ):
103
+ # === Layers ===
104
+ self.token_embedding = ReversibleEmbedding(
105
+ input_dim=vocabulary_size,
106
+ output_dim=hidden_dim,
107
+ tie_weights=tie_word_embeddings,
108
+ embeddings_initializer=_qwen3_kernel_initializer(stddev=0.01),
109
+ dtype=dtype,
110
+ name="token_embedding",
111
+ )
112
+ self.transformer_layers = []
113
+ for i in range(num_layers):
114
+ layer = Qwen3TransformerDecoder(
115
+ intermediate_dim=intermediate_dim,
116
+ head_dim=head_dim,
117
+ num_query_heads=num_query_heads,
118
+ num_key_value_heads=num_key_value_heads,
119
+ rope_max_wavelength=rope_max_wavelength,
120
+ rope_scaling_factor=rope_scaling_factor,
121
+ layer_norm_epsilon=layer_norm_epsilon,
122
+ activation=ops.silu,
123
+ kernel_initializer=_qwen3_kernel_initializer(stddev=0.02),
124
+ dropout=dropout,
125
+ sliding_window_size=sliding_window_size,
126
+ dtype=dtype,
127
+ name=f"transformer_layer_{i}",
128
+ )
129
+ self.transformer_layers.append(layer)
130
+ self.layer_norm = Qwen3LayerNorm(
131
+ epsilon=layer_norm_epsilon,
132
+ dtype=dtype,
133
+ name="sequence_output_layernorm",
134
+ )
135
+
136
+ # === Functional Model ===
137
+ token_id_input = keras.Input(
138
+ shape=(None,), dtype="int32", name="token_ids"
139
+ )
140
+ padding_mask_input = keras.Input(
141
+ shape=(None,), dtype="int32", name="padding_mask"
142
+ )
143
+ x = self.token_embedding(token_id_input)
144
+ for transformer_layer in self.transformer_layers:
145
+ x = transformer_layer(x, decoder_padding_mask=padding_mask_input)
146
+ sequence_output = self.layer_norm(x)
147
+ super().__init__(
148
+ inputs={
149
+ "token_ids": token_id_input,
150
+ "padding_mask": padding_mask_input,
151
+ },
152
+ outputs=sequence_output,
153
+ dtype=dtype,
154
+ **kwargs,
155
+ )
156
+
157
+ # === Config ===
158
+ self.vocabulary_size = vocabulary_size
159
+ self.num_layers = num_layers
160
+ self.num_query_heads = num_query_heads
161
+ self.hidden_dim = hidden_dim
162
+ self.head_dim = head_dim
163
+ self.intermediate_dim = intermediate_dim
164
+ self.rope_max_wavelength = rope_max_wavelength
165
+ self.num_key_value_heads = num_key_value_heads
166
+ self.rope_scaling_factor = rope_scaling_factor
167
+ self.layer_norm_epsilon = layer_norm_epsilon
168
+ self.dropout = dropout
169
+ self.tie_word_embeddings = tie_word_embeddings
170
+ self.sliding_window_size = sliding_window_size
171
+
172
+ def get_config(self):
173
+ config = super().get_config()
174
+ config.update(
175
+ {
176
+ "vocabulary_size": self.vocabulary_size,
177
+ "num_layers": self.num_layers,
178
+ "num_query_heads": self.num_query_heads,
179
+ "hidden_dim": self.hidden_dim,
180
+ "head_dim": self.head_dim,
181
+ "intermediate_dim": self.intermediate_dim,
182
+ "rope_max_wavelength": self.rope_max_wavelength,
183
+ "rope_scaling_factor": self.rope_scaling_factor,
184
+ "num_key_value_heads": self.num_key_value_heads,
185
+ "layer_norm_epsilon": self.layer_norm_epsilon,
186
+ "dropout": self.dropout,
187
+ "tie_word_embeddings": self.tie_word_embeddings,
188
+ "sliding_window_size": self.sliding_window_size,
189
+ }
190
+ )
191
+ return config
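
Because `get_config` above records every constructor argument, a `Qwen3Backbone` round-trips through the standard Keras config serialization. A minimal sketch with toy, non-preset sizes:

```python
import numpy as np
import keras_hub

backbone = keras_hub.models.Qwen3Backbone(
    vocabulary_size=128,
    num_layers=2,
    num_query_heads=8,
    num_key_value_heads=4,
    head_dim=16,
    hidden_dim=64,
    intermediate_dim=128,
)

# get_config/from_config recreates an architecturally identical
# (freshly initialized) model.
clone = keras_hub.models.Qwen3Backbone.from_config(backbone.get_config())

inputs = {
    "token_ids": np.ones((1, 12), dtype="int32"),
    "padding_mask": np.ones((1, 12), dtype="int32"),
}
print(backbone(inputs).shape)  # (1, 12, 64) -- hidden states at hidden_dim
print(clone(inputs).shape)     # same shape, different random weights
```
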
keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py ADDED
@@ -0,0 +1,10 @@
1
+ from keras_hub.src.api_export import keras_hub_export
2
+ from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
3
+ from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
4
+ from keras_hub.src.models.qwen3.qwen3_tokenizer import Qwen3Tokenizer
5
+
6
+
7
+ @keras_hub_export("keras_hub.models.Qwen3CausalLMPreprocessor")
8
+ class Qwen3CausalLMPreprocessor(CausalLMPreprocessor):
9
+ backbone_cls = Qwen3Backbone
10
+ tokenizer_cls = Qwen3Tokenizer
keras_hub/src/models/qwen3/qwen3_decoder.py ADDED
@@ -0,0 +1,309 @@
1
+ import keras
2
+ from keras import ops
3
+
4
+ from keras_hub.src.layers.modeling.transformer_layer_utils import (
5
+ compute_causal_mask,
6
+ )
7
+ from keras_hub.src.layers.modeling.transformer_layer_utils import (
8
+ merge_padding_and_attention_mask,
9
+ )
10
+ from keras_hub.src.models.qwen3.qwen3_attention import Qwen3Attention
11
+ from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm
12
+ from keras_hub.src.utils.keras_utils import clone_initializer
13
+
14
+
15
+ class Qwen3TransformerDecoder(keras.layers.Layer):
16
+ """A Transformer decoder layer for the Qwen3 backbone.
17
+
18
+ This layer implements a Transformer decoder block that includes
19
+ self-attention with optional sliding window attention and a feed-forward
20
+ network.
21
+
22
+ Args:
23
+ intermediate_dim: Output dimension of the first dense layer in the
24
+ feed-forward network.
25
+ num_query_heads: Number of query attention heads.
26
+ num_key_value_heads: Number of key/value attention heads (for GQA).
27
+ rope_max_wavelength: Maximum wavelength for RoPE (Rotary Position
28
+ Embedding).
29
+ rope_scaling_factor: Scaling factor for RoPE, used for extending
30
+ context length.
31
+ activation: Activation function to use in the feed-forward network.
32
+ layer_norm_epsilon: Small float added to variance to avoid dividing
33
+ by zero in layer norm.
34
+ kernel_initializer: Initializer for the kernel weights.
35
+ dropout: Dropout rate for attention and hidden layers.
36
+ sliding_window_size: Size of the sliding window for attention when
37
+ enabled.
38
+ **kwargs: Additional keyword arguments to pass to the Layer.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ intermediate_dim,
44
+ num_query_heads,
45
+ num_key_value_heads,
46
+ head_dim,
47
+ rope_max_wavelength=10000,
48
+ rope_scaling_factor=1.0,
49
+ activation="silu",
50
+ layer_norm_epsilon=1e-5,
51
+ kernel_initializer="glorot_uniform",
52
+ dropout=0.0,
53
+ sliding_window_size=None,
54
+ **kwargs,
55
+ ):
56
+ super().__init__(**kwargs)
57
+ self.intermediate_dim = intermediate_dim
58
+ self.num_query_heads = num_query_heads
59
+ self.num_key_value_heads = num_key_value_heads
60
+ self.head_dim = head_dim
61
+
62
+ self.rope_max_wavelength = rope_max_wavelength
63
+ self.rope_scaling_factor = rope_scaling_factor
64
+
65
+ self.dropout = dropout
66
+
67
+ self.sliding_window_size = sliding_window_size
68
+
69
+ self.activation = keras.activations.get(activation)
70
+ self.layer_norm_epsilon = layer_norm_epsilon
71
+ self.kernel_initializer = keras.initializers.get(kernel_initializer)
72
+
73
+ self.supports_masking = True
74
+
75
+ def build(self, decoder_sequence_shape):
76
+ self._decoder_sequence_shape = decoder_sequence_shape
77
+ self.hidden_dim = decoder_sequence_shape[-1]
78
+
79
+ # Self attention layer.
80
+ self._self_attention_layer = Qwen3Attention(
81
+ num_query_heads=self.num_query_heads,
82
+ num_key_value_heads=self.num_key_value_heads,
83
+ rope_max_wavelength=self.rope_max_wavelength,
84
+ head_dim=self.head_dim,
85
+ rope_scaling_factor=self.rope_scaling_factor,
86
+ kernel_initializer=clone_initializer(self.kernel_initializer),
87
+ dropout=self.dropout,
88
+ sliding_window_size=self.sliding_window_size,
89
+ dtype=self.dtype_policy,
90
+ name="self_attention",
91
+ )
92
+ self._self_attention_layer.build(decoder_sequence_shape)
93
+
94
+ self._self_attention_layernorm = Qwen3LayerNorm(
95
+ epsilon=self.layer_norm_epsilon,
96
+ dtype=self.dtype_policy,
97
+ name="self_attention_layernorm",
98
+ )
99
+
100
+ self._self_attention_layernorm.build(decoder_sequence_shape)
101
+ self._self_attention_dropout = keras.layers.Dropout(
102
+ rate=self.dropout,
103
+ dtype=self.dtype_policy,
104
+ name="self_attention_dropout",
105
+ )
106
+
107
+ # Feedforward layers.
108
+ self._feedforward_intermediate_dense = keras.layers.Dense(
109
+ self.intermediate_dim,
110
+ kernel_initializer=clone_initializer(self.kernel_initializer),
111
+ use_bias=False,
112
+ dtype=self.dtype_policy,
113
+ name="feedforward_intermediate_dense",
114
+ )
115
+ self._feedforward_intermediate_dense.build(decoder_sequence_shape)
116
+
117
+ self._feedforward_gate_dense = keras.layers.Dense(
118
+ self.intermediate_dim,
119
+ kernel_initializer=clone_initializer(self.kernel_initializer),
120
+ use_bias=False,
121
+ dtype=self.dtype_policy,
122
+ name="feedforward_gate_dense",
123
+ )
124
+ self._feedforward_gate_dense.build(decoder_sequence_shape)
125
+
126
+ self._feedforward_output_dense = keras.layers.Dense(
127
+ self.hidden_dim,
128
+ kernel_initializer=clone_initializer(self.kernel_initializer),
129
+ use_bias=False,
130
+ dtype=self.dtype_policy,
131
+ name="feedforward_output_dense",
132
+ )
133
+
134
+ self._feedforward_output_dense.build(
135
+ self._feedforward_gate_dense.compute_output_shape(
136
+ decoder_sequence_shape
137
+ )
138
+ )
139
+
140
+ self._feedforward_layernorm = Qwen3LayerNorm(
141
+ epsilon=self.layer_norm_epsilon,
142
+ dtype=self.dtype_policy,
143
+ name="feedforward_layernorm",
144
+ )
145
+ self._feedforward_layernorm.build(decoder_sequence_shape)
146
+
147
+ self.built = True
148
+
149
+ def call(
150
+ self,
151
+ decoder_sequence,
152
+ decoder_padding_mask=None,
153
+ decoder_attention_mask=None,
154
+ self_attention_cache=None,
155
+ self_attention_cache_update_index=None,
156
+ training=None,
157
+ ):
158
+ """Forward pass for the decoder layer.
159
+
160
+ Args:
161
+ decoder_sequence: Input tensor of shape [batch_size, seq_length,
162
+ hidden_size].
163
+ decoder_padding_mask: Mask tensor for padding tokens.
164
+ decoder_attention_mask: Additional attention mask.
165
+ self_attention_cache: Optional cached key and value tensors for
166
+ self-attention.
167
+ self_attention_cache_update_index: Index at which to update the
168
+ cache.
169
+ training: Boolean indicating whether in training mode.
170
+
171
+ Returns:
172
+ decoder_output: Output tensor after applying transformer decoder
173
+ block.
174
+ self_attention_cache: Updated cache tensors (if cache is provided).
175
+ """
176
+ self_attention_mask = self._compute_self_attention_mask(
177
+ decoder_sequence=decoder_sequence,
178
+ decoder_padding_mask=decoder_padding_mask,
179
+ decoder_attention_mask=decoder_attention_mask,
180
+ self_attention_cache=self_attention_cache,
181
+ self_attention_cache_update_index=self_attention_cache_update_index,
182
+ )
183
+ residual = decoder_sequence
184
+
185
+ x = self._self_attention_layernorm(decoder_sequence)
186
+
187
+ # Self attention block.
188
+ x = self._self_attention_layer(
189
+ hidden_states=x,
190
+ attention_mask=self_attention_mask,
191
+ cache=self_attention_cache,
192
+ cache_update_index=self_attention_cache_update_index,
193
+ )
194
+
195
+ if self_attention_cache is not None:
196
+ x, self_attention_cache = x
197
+
198
+ x = self._self_attention_dropout(x, training=training)
199
+
200
+ x = x + residual
201
+ residual = x
202
+
203
+ x = self._feedforward_layernorm(x)
204
+ gate_output = self._feedforward_gate_dense(x)
205
+
206
+ # Note that we run the activation function in full 32-bit
207
+ # precision since this is what `torch.nn.functional.silu`
208
+ # does. Internally, `torch.nn.functional.silu` converts the
209
+ # inputs to float32, computes SiLU, and converts the outputs
210
+ # back to compute dtype.
211
+ # CPU Kernel: https://github.com/pytorch/pytorch/blob/35c493f2cf9b623bfdc7e6b34dc1cb39690a7919/aten/src/ATen/native/cpu/Activation.cpp#L1221-L1235 # noqa: E501
212
+ # CUDA Kernel: https://github.com/pytorch/pytorch/blob/35c493f2cf9b623bfdc7e6b34dc1cb39690a7919/aten/src/ATen/native/cuda/ActivationSiluKernel.cu # noqa: E501
213
+ gate_output = ops.cast(gate_output, "float32")
214
+ gate_output = self.activation(gate_output)
215
+ gate_output = ops.cast(gate_output, self.compute_dtype)
216
+
217
+ x = self._feedforward_intermediate_dense(x)
218
+
219
+ x = self._feedforward_output_dense(ops.multiply(x, gate_output))
220
+
221
+ decoder_output = x + residual
222
+
223
+ if self_attention_cache is not None:
224
+ return decoder_output, self_attention_cache
225
+ return decoder_output
226
+
227
+ def _compute_self_attention_mask(
228
+ self,
229
+ decoder_sequence,
230
+ decoder_padding_mask,
231
+ decoder_attention_mask,
232
+ self_attention_cache,
233
+ self_attention_cache_update_index,
234
+ ):
235
+ """Computes the self-attention mask combining causal, padding and
236
+ attention masks.
237
+
238
+ Args:
239
+ decoder_sequence: Input tensor.
240
+ decoder_padding_mask: Mask tensor for padding tokens.
241
+ decoder_attention_mask: Additional attention mask.
242
+ self_attention_cache: Optional cached key and value tensors.
243
+ self_attention_cache_update_index: Index at which to update the
244
+ cache.
245
+
246
+ Returns:
247
+ Combined attention mask tensor.
248
+ """
249
+ decoder_mask = merge_padding_and_attention_mask(
250
+ decoder_sequence, decoder_padding_mask, decoder_attention_mask
251
+ )
252
+ batch_size = ops.shape(decoder_sequence)[0]
253
+ input_length = output_length = ops.shape(decoder_sequence)[1]
254
+ # We need to handle a rectangular causal mask when doing cached
255
+ # decoding. For generative inference, `decoder_sequence` will
256
+ # generally be length 1, and `cache` will be the full generation length.
257
+ if self_attention_cache is not None:
258
+ input_length = ops.shape(self_attention_cache)[2]
259
+
260
+ cache_update_index = (
261
+ 0
262
+ if self_attention_cache_update_index is None
263
+ else self_attention_cache_update_index
264
+ )
265
+
266
+ causal_mask = compute_causal_mask(
267
+ batch_size, input_length, output_length, cache_update_index
268
+ )
269
+
270
+ return (
271
+ ops.minimum(decoder_mask, causal_mask)
272
+ if decoder_mask is not None
273
+ else causal_mask
274
+ )
275
+
276
+ def compute_output_shape(self, decoder_sequence_shape):
277
+ """Computes the output shape of the layer.
278
+
279
+ Args:
280
+ decoder_sequence_shape: Shape of the decoder sequence input.
281
+
282
+ Returns:
283
+ Output shape, which is the same as the input shape.
284
+ """
285
+ return decoder_sequence_shape
286
+
287
+ def get_config(self):
288
+ """Returns the config of the layer.
289
+
290
+ Returns:
291
+ Dictionary containing the parameters used to initialize this layer.
292
+ """
293
+ config = super().get_config()
294
+ config.update(
295
+ {
296
+ "intermediate_dim": self.intermediate_dim,
297
+ "num_query_heads": self.num_query_heads,
298
+ "rope_max_wavelength": self.rope_max_wavelength,
299
+ "rope_scaling_factor": self.rope_scaling_factor,
300
+ "num_key_value_heads": self.num_key_value_heads,
301
+ "activation": keras.activations.serialize(self.activation),
302
+ "layer_norm_epsilon": self.layer_norm_epsilon,
303
+ "kernel_initializer": keras.initializers.serialize(
304
+ self.kernel_initializer
305
+ ),
306
+ "dropout": self.dropout,
307
+ }
308
+ )
309
+ return config
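
The feed-forward block in `Qwen3TransformerDecoder` above is a gated (SwiGLU-style) MLP: a gate projection passed through SiLU is multiplied with an "up" projection, then projected back to `hidden_dim` and added to the residual. A minimal functional sketch of that computation using toy stand-in weights (illustrative only, not the layer's actual variables):

```python
import numpy as np
from keras import ops

hidden_dim, intermediate_dim = 8, 16
x = ops.convert_to_tensor(np.random.rand(2, 5, hidden_dim).astype("float32"))

# Toy stand-ins for the gate, intermediate ("up"), and output ("down") kernels.
w_gate = ops.convert_to_tensor(np.random.rand(hidden_dim, intermediate_dim).astype("float32"))
w_up = ops.convert_to_tensor(np.random.rand(hidden_dim, intermediate_dim).astype("float32"))
w_down = ops.convert_to_tensor(np.random.rand(intermediate_dim, hidden_dim).astype("float32"))

gate = ops.silu(ops.matmul(x, w_gate))  # activation only on the gate branch
up = ops.matmul(x, w_up)                # linear intermediate branch
out = ops.matmul(ops.multiply(gate, up), w_down)
print(ops.shape(out))  # (2, 5, 8) -- back at hidden_dim, ready for the residual add
```
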
keras_hub/src/models/qwen3/qwen3_layernorm.py ADDED
@@ -0,0 +1,38 @@
1
+ import keras
2
+ from keras import ops
3
+
4
+
5
+ class Qwen3LayerNorm(keras.layers.Layer):
6
+ """A normalization layer for Qwen that implements RMS normalization."""
7
+
8
+ def __init__(self, head_dim=None, epsilon=1e-6, **kwargs):
9
+ super().__init__(**kwargs)
10
+ self.head_dim = head_dim
11
+ self.epsilon = epsilon
12
+
13
+ def build(self, input_shape):
14
+ if self.head_dim:
15
+ dim = self.head_dim
16
+ else:
17
+ dim = input_shape[-1]
18
+
19
+ self.scale = self.add_weight(
20
+ name="scale",
21
+ trainable=True,
22
+ shape=(dim,),
23
+ initializer="ones",
24
+ dtype=self.variable_dtype,
25
+ )
26
+ self.built = True
27
+
28
+ def call(self, x):
29
+ input_dtype = x.dtype
30
+ x = ops.cast(x, "float32")
31
+ var = ops.mean(ops.power(x, 2), axis=-1, keepdims=True)
32
+ x = x * ops.rsqrt(var + self.epsilon)
33
+ return ops.cast(x * self.scale, input_dtype)
34
+
35
+ def get_config(self):
36
+ config = super().get_config()
37
+ config.update({"epsilon": self.epsilon})
38
+ return config
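
`Qwen3LayerNorm` above is an RMS norm: it rescales by the reciprocal root-mean-square of the features (no mean subtraction, no bias) and then by a learned per-feature `scale`. A small NumPy check of the same arithmetic (illustrative only):

```python
import numpy as np

def rms_norm(x, scale, epsilon=1e-6):
    # Mean of squares over the last axis, as in Qwen3LayerNorm.call.
    var = np.mean(np.power(x.astype("float32"), 2), axis=-1, keepdims=True)
    return (x / np.sqrt(var + epsilon)) * scale

x = np.random.randn(2, 4).astype("float32")
out = rms_norm(x, scale=np.ones(4, dtype="float32"))

# With a unit scale, each row comes out with (approximately) unit RMS.
print(np.sqrt(np.mean(out**2, axis=-1)))  # ~[1. 1.]
```
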
keras_hub/src/models/qwen3/qwen3_tokenizer.py ADDED
@@ -0,0 +1,48 @@
1
+ from keras_hub.src.api_export import keras_hub_export
2
+ from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
3
+ from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer
4
+
5
+
6
+ @keras_hub_export(
7
+ "keras_hub.models.Qwen3Tokenizer",
8
+ )
9
+ class Qwen3Tokenizer(BytePairTokenizer):
10
+ """Tokenizer for Qwen3 models.
11
+
12
+ This tokenizer implements byte-pair encoding (BPE) for Qwen3 models,
13
+ handling special tokens like BOS (beginning of sequence) and EOS (end of
14
+ sequence).
15
+
16
+ Args:
17
+ vocabulary: Dictionary mapping tokens to token IDs, or path to
18
+ vocabulary file.
19
+ merges: List of BPE merges, or path to merges file.
20
+ bos_token: Beginning of sequence token. Defaults to None.
21
+ eos_token: End of sequence token. Defaults to "<|endoftext|>".
22
+ misc_special_tokens: Set of additional special tokens. Defaults to
23
+ empty set.
24
+ """
25
+
26
+ backbone_cls = Qwen3Backbone
27
+
28
+ def __init__(
29
+ self,
30
+ vocabulary=None,
31
+ merges=None,
32
+ **kwargs,
33
+ ):
34
+ # Add EOS token
35
+ eos_token = "<|im_end|>"
36
+ self._add_special_token(eos_token, "end_token")
37
+
38
+ pad_token = "<|endoftext|>"
39
+ self._add_special_token(pad_token, "pad_token")
40
+
41
+ self.start_token_id = None
42
+ self.start_token = None
43
+
44
+ super().__init__(
45
+ vocabulary=vocabulary,
46
+ merges=merges,
47
+ **kwargs,
48
+ )
keras_hub/src/models/qwen_moe/qwen_moe_presets.py CHANGED
@@ -4,8 +4,8 @@ backbone_presets = {
4
4
  "qwen1.5_moe_2.7b_en": {
5
5
  "metadata": {
6
6
  "description": (
7
- "24-layer Qwen MoE model with 2.7 billion active parameters ",
8
- "and 8 experts per MoE layer.",
7
+ "24-layer Qwen MoE model with 2.7 billion active parameters "
8
+ "and 8 experts per MoE layer."
9
9
  ),
10
10
  "params": 14315784192,
11
11
  "path": "qwen-1.5-moe",
keras_hub/src/utils/transformers/convert_qwen3.py ADDED
@@ -0,0 +1,145 @@
1
+ import numpy as np
2
+
3
+ from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
4
+ from keras_hub.src.utils.preset_utils import load_json
5
+
6
+ backbone_cls = Qwen3Backbone
7
+
8
+
9
+ def convert_backbone_config(transformers_config):
10
+ return {
11
+ "vocabulary_size": transformers_config["vocab_size"],
12
+ "head_dim": transformers_config["head_dim"],
13
+ "hidden_dim": transformers_config["hidden_size"],
14
+ "num_layers": transformers_config["num_hidden_layers"],
15
+ "num_query_heads": transformers_config["num_attention_heads"],
16
+ "num_key_value_heads": transformers_config["num_key_value_heads"],
17
+ "intermediate_dim": transformers_config["intermediate_size"],
18
+ "layer_norm_epsilon": transformers_config["rms_norm_eps"],
19
+ "rope_max_wavelength": transformers_config["rope_theta"],
20
+ "sliding_window_size": transformers_config["sliding_window"]
21
+ if transformers_config["use_sliding_window"]
22
+ else None,
23
+ "tie_word_embeddings": transformers_config["tie_word_embeddings"],
24
+ }
25
+
26
+
27
+ def convert_weights(backbone, loader, transformers_config):
28
+ loader.port_weight(
29
+ keras_variable=backbone.get_layer("token_embedding").embeddings,
30
+ hf_weight_key="model.embed_tokens.weight",
31
+ )
32
+ if not backbone.tie_word_embeddings:
33
+ loader.port_weight(
34
+ keras_variable=backbone.get_layer(
35
+ "token_embedding"
36
+ ).reverse_embeddings,
37
+ hf_weight_key="lm_head.weight",
38
+ # rearrange_pattern="b a -> a b",
39
+ hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
40
+ )
41
+
42
+ def transpose_and_reshape(x, shape):
43
+ return np.reshape(np.transpose(x), shape)
44
+
45
+ for i in range(backbone.num_layers):
46
+ decoder_layer = backbone.get_layer(f"transformer_layer_{i}")
47
+
48
+ # Input layernorm
49
+ loader.port_weight(
50
+ keras_variable=decoder_layer._self_attention_layernorm.scale,
51
+ hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
52
+ )
53
+
54
+ # Attention layers
55
+
56
+ ## Query
57
+ loader.port_weight(
58
+ keras_variable=decoder_layer._self_attention_layer._query_dense.kernel,
59
+ hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
60
+ hook_fn=transpose_and_reshape,
61
+ )
62
+ loader.port_weight(
63
+ keras_variable=decoder_layer._self_attention_layer._query_dense_layer_norm.scale,
64
+ hf_weight_key=f"model.layers.{i}.self_attn.q_norm.weight",
65
+ )
66
+ ## Key
67
+ loader.port_weight(
68
+ keras_variable=decoder_layer._self_attention_layer._key_dense.kernel,
69
+ hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
70
+ hook_fn=transpose_and_reshape,
71
+ )
72
+ loader.port_weight(
73
+ keras_variable=decoder_layer._self_attention_layer._key_dense_layer_norm.scale,
74
+ hf_weight_key=f"model.layers.{i}.self_attn.k_norm.weight",
75
+ )
76
+ ## Value
77
+ loader.port_weight(
78
+ keras_variable=decoder_layer._self_attention_layer._value_dense.kernel,
79
+ hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
80
+ hook_fn=transpose_and_reshape,
81
+ )
82
+ ## Output
83
+ loader.port_weight(
84
+ keras_variable=decoder_layer._self_attention_layer._output_dense.kernel,
85
+ hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
86
+ # rearrange_patterns="c (a b) -> a b c",
87
+ # rearrange_dims={"a": backbone.num_query_heads},
88
+ hook_fn=transpose_and_reshape,
89
+ )
90
+
91
+ # MLP layers
92
+ loader.port_weight(
93
+ keras_variable=decoder_layer._feedforward_intermediate_dense.kernel,
94
+ hf_weight_key=f"model.layers.{i}.mlp.up_proj.weight",
95
+ # rearrange_patterns="b a -> a b",
96
+ hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
97
+ )
98
+ loader.port_weight(
99
+ keras_variable=decoder_layer._feedforward_output_dense.kernel,
100
+ hf_weight_key=f"model.layers.{i}.mlp.down_proj.weight",
101
+ # rearrange_patterns="b a -> a b",
102
+ hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
103
+ )
104
+ loader.port_weight(
105
+ keras_variable=decoder_layer._feedforward_gate_dense.kernel,
106
+ hf_weight_key=f"model.layers.{i}.mlp.gate_proj.weight",
107
+ # rearrange_patterns="b a -> a b",
108
+ hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
109
+ )
110
+
111
+ # Feedforward layernorm
112
+ loader.port_weight(
113
+ keras_variable=decoder_layer._feedforward_layernorm.scale,
114
+ hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
115
+ )
116
+
117
+ # Final normalization layer
118
+ loader.port_weight(
119
+ keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
120
+ hf_weight_key="model.norm.weight",
121
+ )
122
+
123
+ return backbone
124
+
125
+
126
+ def convert_tokenizer(cls, preset, **kwargs):
127
+ tokenizer_config = load_json(preset, "tokenizer.json")
128
+ vocab = tokenizer_config["model"]["vocab"]
129
+ merges = tokenizer_config["model"]["merges"]
130
+ merges = [" ".join(item) for item in merges]
131
+
132
+ # Load all special tokens with the exception of "reserved" ones.
133
+ special_tokens = set()
134
+ for token in tokenizer_config["added_tokens"]:
135
+ if not token["content"].startswith("<|reserved_special_token_"):
136
+ vocab[token["content"]] = token["id"]
137
+ special_tokens.add(token["content"])
138
+
139
+ kwargs.update(
140
+ {
141
+ "unsplittable_tokens": list(special_tokens),
142
+ }
143
+ )
144
+
145
+ return cls(vocabulary=vocab, merges=merges, **kwargs)
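
The attention projections above are ported with `transpose_and_reshape` because Hugging Face stores `q_proj.weight` as a dense `(num_heads * head_dim, hidden_dim)` matrix, while the `EinsumDense` kernel built by `Qwen3Attention` expects `(hidden_dim, num_heads, head_dim)`. A shape-only NumPy sketch of that conversion with toy sizes (illustrative only):

```python
import numpy as np

hidden_dim, num_heads, head_dim = 64, 8, 16
hf_q_proj = np.random.rand(num_heads * head_dim, hidden_dim).astype("float32")

def transpose_and_reshape(x, shape):
    # Same helper as in convert_weights: transpose (out, in) -> (in, out),
    # then split the fused head dimension into (num_heads, head_dim).
    return np.reshape(np.transpose(x), shape)

keras_kernel = transpose_and_reshape(hf_q_proj, (hidden_dim, num_heads, head_dim))
print(keras_kernel.shape)  # (64, 8, 16) -- matches the "bqm,muh->bquh" kernel
```
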
keras_hub/src/utils/transformers/preset_loader.py CHANGED
@@ -14,6 +14,7 @@ from keras_hub.src.utils.transformers import convert_mistral
14
14
  from keras_hub.src.utils.transformers import convert_mixtral
15
15
  from keras_hub.src.utils.transformers import convert_pali_gemma
16
16
  from keras_hub.src.utils.transformers import convert_qwen
17
+ from keras_hub.src.utils.transformers import convert_qwen3
17
18
  from keras_hub.src.utils.transformers import convert_qwen_moe
18
19
  from keras_hub.src.utils.transformers import convert_vit
19
20
  from keras_hub.src.utils.transformers.safetensor_utils import SafetensorLoader
@@ -50,6 +51,8 @@ class TransformersPresetLoader(PresetLoader):
50
51
  self.converter = convert_mixtral
51
52
  elif model_type == "qwen2_moe":
52
53
  self.converter = convert_qwen_moe
54
+ elif model_type == "qwen3":
55
+ self.converter = convert_qwen3
53
56
  else:
54
57
  raise ValueError(
55
58
  "KerasHub has no converter for huggingface/transformers models "
keras_hub/src/version.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from keras_hub.src.api_export import keras_hub_export
2
2
 
3
3
  # Unique source of truth for the version number.
4
- __version__ = "0.22.0.dev202505290412"
4
+ __version__ = "0.22.0.dev202505300409"
5
5
 
6
6
 
7
7
  @keras_hub_export("keras_hub.version")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: keras-hub-nightly
3
- Version: 0.22.0.dev202505290412
3
+ Version: 0.22.0.dev202505300409
4
4
  Summary: Pretrained models for Keras.
5
5
  Author-email: Keras team <keras-users@googlegroups.com>
6
6
  License-Expression: Apache-2.0
@@ -1,11 +1,11 @@
1
1
  keras_hub/__init__.py,sha256=bJbUZkqwhZvTb1Tqx1fbkq6mzBYiEyq-Hin3oQIkhdE,558
2
2
  keras_hub/layers/__init__.py,sha256=gnvT-GuASB1hZwY4zrRkLs5yohSQu9Pp1SHDxsWPLY8,5081
3
3
  keras_hub/metrics/__init__.py,sha256=KYalsMPBnfwim9BdGHFfJ5WxUKFXOQ1QoKIMT_0lwlM,439
4
- keras_hub/models/__init__.py,sha256=itSzodVUeuX6HQnmsSXY0Wv-5Htbu397410R-SFW_4I,26411
4
+ keras_hub/models/__init__.py,sha256=1ZKgLK4AZ44s_cH7vu6FvmVocxf0biLAnY_lEh3dgxw,26734
5
5
  keras_hub/samplers/__init__.py,sha256=aFQIkiqbZpi8vjrPp2MVII4QUfE-eQjra5fMeHsoy7k,886
6
6
  keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
8
- keras_hub/src/version.py,sha256=DDvaRSyKJcjRMYdIJIroiLLIbnEZPXF5mlsR_VQNowQ,222
8
+ keras_hub/src/version.py,sha256=hJyx_F3-Sy3RRaPET6xBnbg7QRtPkFgRHC4_SaxL3bw,222
9
9
  keras_hub/src/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  keras_hub/src/layers/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  keras_hub/src/layers/modeling/alibi_bias.py,sha256=1XBTHI52L_iJDhN_w5ydu_iMhCuTgQAxEPwcLA6BPuk,4411
@@ -259,7 +259,7 @@ keras_hub/src/models/mixtral/mixtral_causal_lm.py,sha256=JA1t6xTeaYX_fNo9ftRyvzd
259
259
  keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py,sha256=q2qXa9QAUWBvOWv9DeNvwsBNXSORJAbQFoQsWQ7e8V8,3079
260
260
  keras_hub/src/models/mixtral/mixtral_decoder.py,sha256=CvOjhTxPnGQ_HNknZXRI6Cx1kpuHG99_TiOh-mNcsDw,18190
261
261
  keras_hub/src/models/mixtral/mixtral_layer_norm.py,sha256=zfbDKZEb45FTwP0zQd7WPPp8tuiGoSNfS-DRYWkZyWw,1031
262
- keras_hub/src/models/mixtral/mixtral_presets.py,sha256=AteLrYXyVjooz_DHLnBA1OMlZS6LMu7Y7gGUWddn6go,856
262
+ keras_hub/src/models/mixtral/mixtral_presets.py,sha256=pi5hHcwVSqr7ytf4dSnU_ew_t7NYw7EsZrmklQDqDVo,852
263
263
  keras_hub/src/models/mixtral/mixtral_tokenizer.py,sha256=Kc233k879QMyX164X_CzWbqpnqEkKWNqa648guTGkBk,661
264
264
  keras_hub/src/models/mobilenet/__init__.py,sha256=hxkNGGj_iAMu62iooUDEPA818sNOIgjG7pXMLEMOsAE,275
265
265
  keras_hub/src/models/mobilenet/mobilenet_backbone.py,sha256=aZBSFeLUObYYoi3od9DI1KfgPCqh5GHTcAI8Y2ZHShA,29536
@@ -311,8 +311,14 @@ keras_hub/src/models/qwen/qwen_causal_lm.py,sha256=_f-UHaKHp0ncxknpkpEJiW3jlng3E
311
311
  keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py,sha256=Va-4TLJD3ycEnkS41rF3dVj4_6K0j-gxLTrREFRcyr0,609
312
312
  keras_hub/src/models/qwen/qwen_decoder.py,sha256=utmAvZlU7_nP-6pjGPDinK4JaMzsQSwOARG0ote-jAg,11771
313
313
  keras_hub/src/models/qwen/qwen_layernorm.py,sha256=DS35r3qd6g5ocL7Nhf_vNzLLMo1aI9VCSmL64dgNOYI,924
314
- keras_hub/src/models/qwen/qwen_presets.py,sha256=DpRplWNwktM4KDgIP495PTUBJxQE_mS6KQSK5LGWOyc,1998
314
+ keras_hub/src/models/qwen/qwen_presets.py,sha256=1FkKV6M3yqJz4EP1xa7bEvfIQ721xXT-_ikjWX0xvww,1992
315
315
  keras_hub/src/models/qwen/qwen_tokenizer.py,sha256=LCv3IyiDDHqVnM9N3lf5-BE3iwicIh0nKS1hjoPw9lE,1532
316
+ keras_hub/src/models/qwen3/qwen3_attention.py,sha256=sewLjli290XvJ1efGZJEAYqUZfRll7cmhu0258s4C48,13042
317
+ keras_hub/src/models/qwen3/qwen3_backbone.py,sha256=Ylpk_rRWWRxy8irlAPjJU-YrxYGpo8c9lSEO1zZl4gU,7456
318
+ keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py,sha256=H4g-bgvuhAUnDwjJovydK16Kes38ZFZWPvflrgHqZis,458
319
+ keras_hub/src/models/qwen3/qwen3_decoder.py,sha256=68s9jQj53zFmXE4-SGXKYHu546fXOyi9LUbnKk-HGYY,11595
320
+ keras_hub/src/models/qwen3/qwen3_layernorm.py,sha256=EJxjf7Pr6ufPQnNeuYQxkExzPjPk4PQxqMsoBeSEkDo,1073
321
+ keras_hub/src/models/qwen3/qwen3_tokenizer.py,sha256=LmPtg0vprMchDvYfTj8m5PraXI2QS3-YgdIIpIm5iAs,1448
316
322
  keras_hub/src/models/qwen_moe/__init__.py,sha256=5D8GUmVDsJs0J4sVZHcXOLkZf12U96l-WtwyVee4lu8,267
317
323
  keras_hub/src/models/qwen_moe/qwen_moe_attention.py,sha256=pE79_iHUm2LGkoWL6zMJw_pNfzIvmyq3yJaiq47W2TY,13242
318
324
  keras_hub/src/models/qwen_moe/qwen_moe_backbone.py,sha256=nrfELvIvRLmrgKrUNXci2CrecmeI6bWzJj7HH-RcWJA,15341
@@ -320,7 +326,7 @@ keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py,sha256=MeP60v7GcN_SmH5_ULRpq
320
326
  keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py,sha256=uKaXRrJs02vkVudjdehzJPp0B84tPMkxNHlp166kceE,589
321
327
  keras_hub/src/models/qwen_moe/qwen_moe_decoder.py,sha256=kmUjLpYTbJQ3J_31qWhLOd0Dg2_9cl_JX_zM8ZMH1Qo,23130
322
328
  keras_hub/src/models/qwen_moe/qwen_moe_layernorm.py,sha256=DbkWJo7U0-cwdZwHPeAnFznYwtao6o0fjpoDJ9UWnpc,927
323
- keras_hub/src/models/qwen_moe/qwen_moe_presets.py,sha256=uKrA9xLV3P3jtYUUsqdhKq_HPkB4lXmOYseB1wXTZnI,457
329
+ keras_hub/src/models/qwen_moe/qwen_moe_presets.py,sha256=LhOA3Ow-z3cNTan4AOrtyCXS58EgfvO_gtqiZt5cUQc,455
324
330
  keras_hub/src/models/qwen_moe/qwen_moe_tokenizer.py,sha256=2c3X8jNGO0q0UL5NtUqSgHWLqhyJGi2ohNcTeOGhd84,1407
325
331
  keras_hub/src/models/resnet/__init__.py,sha256=C5UqlQ6apm8WSp1bnrxB6Bi3BGaknxRQs-r3b2wpaGA,257
326
332
  keras_hub/src/models/resnet/resnet_backbone.py,sha256=Q7nlqcTXZzjqd0e-DsjHC4ok58yOX7qxseotym3uZpM,31276
@@ -496,13 +502,14 @@ keras_hub/src/utils/transformers/convert_mistral.py,sha256=kVhN9h1ZFVhwkNW8p3wnS
496
502
  keras_hub/src/utils/transformers/convert_mixtral.py,sha256=PxeCY8Xe7U_caICugwOCEjuSZ51ZUtmef6rUxh-Wt54,5508
497
503
  keras_hub/src/utils/transformers/convert_pali_gemma.py,sha256=B1leeDw96Yvu81hYumf66hIid07k5NLqoeWAJgPnaLs,10649
498
504
  keras_hub/src/utils/transformers/convert_qwen.py,sha256=WUxMAEFVqRs7TRw7QU5TH3_ev4yf02R1xFVliMvTQqg,5886
505
+ keras_hub/src/utils/transformers/convert_qwen3.py,sha256=LIormvCMWPq6X9Wo2eNbADjtFZ0nI7tFGZFBxmo4GKw,5700
499
506
  keras_hub/src/utils/transformers/convert_qwen_moe.py,sha256=a7R28aln-PdAcNuKAXdrtzvslho2Co6GypChxLMKPpc,10618
500
507
  keras_hub/src/utils/transformers/convert_vit.py,sha256=9SUZ9utNJhW_5cj3acMn9cRy47u2eIcDsrhmzj77o9k,5187
501
- keras_hub/src/utils/transformers/preset_loader.py,sha256=1nfS5xVsl-JROGXJXltTqV1fQdcUlZbGGcbf-n79pXM,4225
508
+ keras_hub/src/utils/transformers/preset_loader.py,sha256=7tFnbyAiUCMcTG8VQ7Wpi-J7cvRoSZn-ZYE_l0xuh0M,4363
502
509
  keras_hub/src/utils/transformers/safetensor_utils.py,sha256=CYUHyA4y-B61r7NDnCsFb4t_UmSwZ1k9L-8gzEd6KRg,3339
503
510
  keras_hub/tokenizers/__init__.py,sha256=uMjjm0mzUkRb0e4Ac_JK8aJ9cKGUi5UqmzWoWAFJprE,4164
504
511
  keras_hub/utils/__init__.py,sha256=jXPqVGBpJr_PpYmqD8aDG-fRMlxH-ulqCR2SZMn288Y,646
505
- keras_hub_nightly-0.22.0.dev202505290412.dist-info/METADATA,sha256=W4vT73-ho1j4QwQv59qS5xF4i6bWH5k7tHiUJ7-_y4k,7393
506
- keras_hub_nightly-0.22.0.dev202505290412.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
507
- keras_hub_nightly-0.22.0.dev202505290412.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
508
- keras_hub_nightly-0.22.0.dev202505290412.dist-info/RECORD,,
512
+ keras_hub_nightly-0.22.0.dev202505300409.dist-info/METADATA,sha256=hH3xqnggYJvyKQ7DG5U0pJyM8umkP1oRPj32GKEu1E8,7393
513
+ keras_hub_nightly-0.22.0.dev202505300409.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
514
+ keras_hub_nightly-0.22.0.dev202505300409.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
515
+ keras_hub_nightly-0.22.0.dev202505300409.dist-info/RECORD,,