keras-hub-nightly 0.22.0.dev202505290412__py3-none-any.whl → 0.22.0.dev202505300409__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/models/__init__.py +9 -0
- keras_hub/src/models/mixtral/mixtral_presets.py +4 -4
- keras_hub/src/models/qwen/qwen_presets.py +6 -6
- keras_hub/src/models/qwen3/qwen3_attention.py +369 -0
- keras_hub/src/models/qwen3/qwen3_backbone.py +191 -0
- keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py +10 -0
- keras_hub/src/models/qwen3/qwen3_decoder.py +309 -0
- keras_hub/src/models/qwen3/qwen3_layernorm.py +38 -0
- keras_hub/src/models/qwen3/qwen3_tokenizer.py +48 -0
- keras_hub/src/models/qwen_moe/qwen_moe_presets.py +2 -2
- keras_hub/src/utils/transformers/convert_qwen3.py +145 -0
- keras_hub/src/utils/transformers/preset_loader.py +3 -0
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505300409.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505300409.dist-info}/RECORD +17 -10
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505300409.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505300409.dist-info}/top_level.txt +0 -0
keras_hub/models/__init__.py
CHANGED
@@ -444,6 +444,15 @@ from keras_hub.src.models.qwen.qwen_tokenizer import (
 from keras_hub.src.models.qwen.qwen_tokenizer import (
     QwenTokenizer as QwenTokenizer,
 )
+from keras_hub.src.models.qwen3.qwen3_backbone import (
+    Qwen3Backbone as Qwen3Backbone,
+)
+from keras_hub.src.models.qwen3.qwen3_causal_lm_preprocessor import (
+    Qwen3CausalLMPreprocessor as Qwen3CausalLMPreprocessor,
+)
+from keras_hub.src.models.qwen3.qwen3_tokenizer import (
+    Qwen3Tokenizer as Qwen3Tokenizer,
+)
 from keras_hub.src.models.qwen_moe.qwen_moe_backbone import (
     QwenMoeBackbone as QwenMoeBackbone,
 )
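The three additions above expose the new Qwen3 classes on the public `keras_hub.models` namespace. A trivial smoke test of the new API surface, which only checks that the symbols resolve and downloads nothing:

```python
import keras_hub

# Each of these is newly re-exported in this nightly build.
print(keras_hub.models.Qwen3Backbone)
print(keras_hub.models.Qwen3CausalLMPreprocessor)
print(keras_hub.models.Qwen3Tokenizer)
```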
keras_hub/src/models/mixtral/mixtral_presets.py
CHANGED
@@ -4,8 +4,8 @@ backbone_presets = {
     "mixtral_8_7b_en": {
         "metadata": {
             "description": (
-                "32-layer Mixtral MoE model with 7 billion"
-                "active parameters and 8 experts per MoE layer."
+                "32-layer Mixtral MoE model with 7 billion"
+                "active parameters and 8 experts per MoE layer."
             ),
             "params": 46702792704,
             "path": "mixtral",
@@ -15,8 +15,8 @@ backbone_presets = {
     "mixtral_8_instruct_7b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 32-layer Mixtral MoE model"
-                "with 7 billion active parameters and 8 experts per MoE layer."
+                "Instruction fine-tuned 32-layer Mixtral MoE model"
+                "with 7 billion active parameters and 8 experts per MoE layer."
             ),
             "params": 46702792704,
             "path": "mixtral",
keras_hub/src/models/qwen/qwen_presets.py
CHANGED
@@ -28,8 +28,8 @@ backbone_presets = {
     "qwen2.5_instruct_0.5b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 24-layer Qwen model with 0.5 "
-                "billion parameters."
+                "Instruction fine-tuned 24-layer Qwen model with 0.5 "
+                "billion parameters."
             ),
             "params": 494032768,
             "path": "qwen",
@@ -39,8 +39,8 @@ backbone_presets = {
     "qwen2.5_instruct_32b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 64-layer Qwen model with 32 "
-                "billion parameters."
+                "Instruction fine-tuned 64-layer Qwen model with 32 "
+                "billion parameters."
             ),
             "params": 32763876352,
             "path": "qwen",
@@ -50,8 +50,8 @@ backbone_presets = {
     "qwen2.5_instruct_72b_en": {
         "metadata": {
             "description": (
-                "Instruction fine-tuned 80-layer Qwen model with 72 "
-                "billion parameters."
+                "Instruction fine-tuned 80-layer Qwen model with 72 "
+                "billion parameters."
             ),
             "params": 72706203648,
             "path": "qwen",
keras_hub/src/models/qwen3/qwen3_attention.py
ADDED
@@ -0,0 +1,369 @@
+import math
+
+import keras
+from keras import ops
+
+from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
+from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm
+from keras_hub.src.utils.keras_utils import clone_initializer
+from keras_hub.src.utils.keras_utils import fused_attention_op_available
+
+
+class Qwen3Attention(keras.layers.Layer):
+    """A multi-head attention layer for Qwen3 models
+
+    This attention implementation supports grouped-query attention (GQA) where
+    the number of key-value heads can be less than the number of query heads.
+
+    Args:
+        num_query_heads: Number of query heads.
+        num_key_value_heads: Number of key/value heads (for GQA).
+        rope_max_wavelength: Maximum wavelength for RoPE (Rotary Position
+            Embedding).
+        rope_scaling_factor: Scaling factor for RoPE, used for extending
+            context length.
+        kernel_initializer: Initializer for the kernel weights.
+        dropout: Dropout rate for attention weights.
+        sliding_window_size: Size of the sliding window for attention.
+        **kwargs: Additional keyword arguments to pass to the Layer.
+    """
+
+    def __init__(
+        self,
+        num_query_heads,
+        num_key_value_heads,
+        head_dim,
+        rope_max_wavelength=10000,
+        rope_scaling_factor=1,
+        kernel_initializer="glorot_uniform",
+        dropout=0.0,
+        layer_norm_epsilon=1e-5,
+        sliding_window_size=None,
+        **kwargs,
+    ):
+        super().__init__(
+            **kwargs,
+        )
+        self.num_query_heads = num_query_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.dropout = dropout
+
+        self.layer_norm_epsilon = layer_norm_epsilon
+
+        self.num_key_value_groups = num_query_heads // num_key_value_heads
+        self.rope_max_wavelength = rope_max_wavelength
+
+        self.kernel_initializer = keras.initializers.get(
+            clone_initializer(kernel_initializer)
+        )
+
+        self.rope_scaling_factor = rope_scaling_factor
+        self.sliding_window_size = sliding_window_size
+
+    def build(self, inputs_shape):
+        # Einsum variables:
+        # b = batch size
+        # q = query length
+        # k = key/value length
+        # m = model dim
+        # u = num query heads
+        # v = num key/value heads
+        # h = head dim
+        hidden_dim = inputs_shape[-1]
+        if not self.head_dim:
+            self.head_dim = hidden_dim // self.num_query_heads
+
+        self._inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
+        self._query_dense = keras.layers.EinsumDense(
+            equation="bqm,muh->bquh",
+            output_shape=(None, self.num_query_heads, self.head_dim),
+            kernel_initializer=self.kernel_initializer,
+            dtype=self.dtype_policy,
+            name="query",
+        )
+        self._query_dense.build(inputs_shape)
+
+        self._query_dense_layer_norm = Qwen3LayerNorm(
+            epsilon=self.layer_norm_epsilon,
+            dtype=self.dtype_policy,
+            head_dim=self.head_dim,
+            name="query_dense_layernorm",
+        )
+        self._query_dense_layer_norm.build(inputs_shape)
+
+        self._key_dense = keras.layers.EinsumDense(
+            equation="bkm,mvh->bkvh",
+            output_shape=(
+                None,
+                self.num_key_value_heads,
+                self.head_dim,
+            ),
+            kernel_initializer=self.kernel_initializer,
+            dtype=self.dtype_policy,
+            name="key",
+        )
+        self._key_dense.build(inputs_shape)
+
+        self._key_dense_layer_norm = Qwen3LayerNorm(
+            epsilon=self.layer_norm_epsilon,
+            dtype=self.dtype_policy,
+            head_dim=self.head_dim,
+            name="key_dense_layernorm",
+        )
+        self._key_dense_layer_norm.build(inputs_shape)
+
+        self._value_dense = keras.layers.EinsumDense(
+            equation="bkm,mvh->bkvh",
+            output_shape=(
+                None,
+                self.num_key_value_heads,
+                self.head_dim,
+            ),
+            kernel_initializer=self.kernel_initializer,
+            dtype=self.dtype_policy,
+            name="value",
+        )
+        self._value_dense.build(inputs_shape)
+
+        self._softmax = keras.layers.Softmax(
+            axis=-1,
+            dtype="float32",
+            name="attention_softmax",
+        )
+
+        self._dropout_layer = keras.layers.Dropout(
+            rate=self.dropout,
+            dtype=self.dtype_policy,
+        )
+
+        self._output_dense = keras.layers.EinsumDense(
+            equation="bquh,uhm->bqm",
+            output_shape=(None, hidden_dim),
+            kernel_initializer=self.kernel_initializer,
+            dtype=self.dtype_policy,
+            name="attention_output",
+        )
+        self._output_dense.build(
+            (None, None, self.num_query_heads, self.head_dim)
+        )
+
+        self.rotary_embedding_layer = RotaryEmbedding(
+            max_wavelength=self.rope_max_wavelength,
+            scaling_factor=self.rope_scaling_factor,
+            dtype=self.dtype_policy,
+        )
+
+        self._dot_product_equation = "bquh,bkuh->buqk"
+        self._combine_equation = "buqk,bkuh->bquh"
+
+        self.built = True
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        cache=None,
+        cache_update_index=None,
+        training=None,
+    ):
+        """Applies attention mechanism to the input hidden states.
+
+        Args:
+            hidden_states: Input tensor of shape [batch_size, seq_length,
+                hidden_size].
+            attention_mask: Mask tensor of shape [batch_size, seq_length,
+                seq_length].
+            cache: Optional cached key and value tensors.
+            cache_update_index: Index at which to update the cache.
+            training: Boolean indicating whether in training mode.
+
+        Returns:
+            attention_output: Output tensor after applying attention.
+            cache: Updated cache tensors (if cache is provided).
+        """
+        start_index = (
+            cache_update_index if cache_update_index is not None else 0
+        )
+
+        query = self._query_dense(hidden_states)
+        query = self._query_dense_layer_norm(query)
+
+        # Compute RoPE for queries
+        query = self.rotary_embedding_layer(query, start_index=start_index)
+
+        def _compute_key_value(x):
+            key = self._key_dense(x)
+            key = self._key_dense_layer_norm(key)
+            key = self.rotary_embedding_layer(key, start_index=start_index)
+
+            value = self._value_dense(x)
+
+            return key, value
+
+        if cache is not None:
+            key_cache = cache[:, 0, ...]
+            value_cache = cache[:, 1, ...]
+            if cache_update_index is None:
+                key = key_cache
+                value = value_cache
+            else:
+                key_update, value_update = _compute_key_value(hidden_states)
+                start = [0, cache_update_index, 0, 0]
+                key = ops.slice_update(key_cache, start, key_update)
+                value = ops.slice_update(value_cache, start, value_update)
+                cache = ops.stack((key, value), axis=1)
+        else:
+            if cache_update_index is not None:
+                raise ValueError(
+                    "`cache_update_index` should not be set if `cache` is "
+                    f"`None`. Received: cache={cache}, "
+                    f"cache_update_index={cache_update_index}"
+                )
+            key, value = _compute_key_value(hidden_states)
+
+        # [batch_shape, seq_len, num_key_value_heads, head_dim]
+        # -> [batch_shape, seq_len, num_heads, head_dim]
+        key = ops.repeat(key, repeats=self.num_key_value_groups, axis=2)
+        value = ops.repeat(value, repeats=self.num_key_value_groups, axis=2)
+
+        attention_output = self._compute_attention(
+            query,
+            key,
+            value,
+            attention_mask,
+            cache_update_index=cache_update_index,
+        )
+
+        attention_output = self._dropout_layer(
+            attention_output, training=training
+        )
+
+        attention_output = self._output_dense(attention_output)
+
+        if cache is not None:
+            return attention_output, cache
+        return attention_output
+
+    def _masked_softmax(self, attention_scores, attention_mask=None):
+        """Applies softmax with optional masking.
+
+        Args:
+            attention_scores: Attention score tensor.
+            attention_mask: Optional mask tensor.
+
+        Returns:
+            Masked softmax attention weights.
+        """
+        if attention_mask is not None:
+            return self._softmax(
+                attention_scores, attention_mask[:, None, :, :]
+            )
+        return self._softmax(attention_scores)
+
+    def _compute_attention(
+        self, query, key, value, attention_mask=None, cache_update_index=None
+    ):
+        """Computes attention using query, key, and value tensors.
+
+        Uses Flash Attention when available for better performance.
+
+        Args:
+            query: Query tensor.
+            key: Key tensor.
+            value: Value tensor.
+            attention_mask: Optional mask tensor.
+            cache_update_index: Index for sliding window computation.
+
+        Returns:
+            attention_output: Output tensor after applying attention.
+        """
+        if fused_attention_op_available():
+            # Use `dot_product_attention` with Flash Attention support if
+            # available.
+            if attention_mask is not None:
+                attention_mask = ops.expand_dims(attention_mask, axis=1)
+                attention_mask = ops.cast(attention_mask, dtype="bool")
+            attention_output = ops.dot_product_attention(
+                query,
+                key,
+                value,
+                mask=attention_mask,
+                scale=self._inv_norm_factor,
+            )
+            return attention_output
+
+        attention_scores = ops.einsum(self._dot_product_equation, query, key)
+
+        attention_scores = ops.multiply(
+            attention_scores,
+            ops.cast(self._inv_norm_factor, self.compute_dtype),
+        )
+        if not self.sliding_window_size:
+            attention_mask = self._mask_sliding_window(
+                attention_mask,
+                cache_update_index=cache_update_index
+                if cache_update_index
+                else 0,
+            )
+        attention_scores = self._masked_softmax(
+            attention_scores, attention_mask
+        )
+        attention_scores = ops.cast(attention_scores, self.compute_dtype)
+        attention_output = ops.einsum(
+            self._combine_equation, attention_scores, value
+        )
+
+        return attention_output
+
+    def _mask_sliding_window(
+        self,
+        attention_mask,
+        cache_update_index=0,
+    ):
+        """Creates and combines a sliding window mask with the attention mask.
+
+        Args:
+            attention_mask: Original attention mask.
+            cache_update_index: Starting index for the sliding window.
+
+        Returns:
+            Combined attention mask with sliding window constraints.
+        """
+        _, query_len, key_len = ops.shape(attention_mask)
+        # Compute the sliding window for square attention.
+        all_ones = ops.ones((key_len, key_len), "bool")
+        if keras.config.backend() == "tensorflow":
+            # TODO: trui/tril has issues with dynamic shape on the tensorflow
+            # backend. We should fix, but use `band_part` for now.
+            import tensorflow as tf
+
+            band_size = ops.minimum(key_len, self.sliding_window_size - 1)
+            band_size = ops.cast(band_size, "int32")
+            sliding_mask = tf.linalg.band_part(all_ones, band_size, band_size)
+        else:
+            sliding_mask = ops.triu(
+                all_ones, -1 * self.sliding_window_size + 1
+            ) * ops.tril(all_ones, self.sliding_window_size - 1)
+        # Slice the window for short queries during generation.
+        start = (cache_update_index, 0)
+        sliding_mask = ops.slice(sliding_mask, start, (query_len, key_len))
+        sliding_mask = ops.expand_dims(sliding_mask, 0)
+        return ops.logical_and(attention_mask, ops.cast(sliding_mask, "bool"))
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_query_heads": self.num_query_heads,
+                "num_key_value_heads": self.num_key_value_heads,
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
+                "kernel_initializer": keras.initializers.serialize(
+                    self.kernel_initializer
+                ),
+                "dropout": self.dropout,
+                "sliding_window_size": self.sliding_window_size,
+            }
+        )
+        return config
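The grouped-query step in `call()` above is just a repeat along the head axis: each key/value head is shared by `num_query_heads // num_key_value_heads` query heads before the standard dot-product attention. A standalone sketch with toy shapes (the numbers are illustrative, not tied to any preset):

```python
import numpy as np
from keras import ops

batch, seq_len, head_dim = 2, 8, 16
num_query_heads, num_key_value_heads = 8, 2
num_key_value_groups = num_query_heads // num_key_value_heads  # 4

# Key/value tensors come out of the projections with only 2 heads...
key = np.zeros((batch, seq_len, num_key_value_heads, head_dim), "float32")
# ...and are repeated along axis=2 so they line up with the 8 query heads.
key = ops.repeat(key, repeats=num_key_value_groups, axis=2)
print(ops.shape(key))  # (2, 8, 8, 16)
```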
keras_hub/src/models/qwen3/qwen3_backbone.py
ADDED
@@ -0,0 +1,191 @@
+import keras
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.modeling.reversible_embedding import (
+    ReversibleEmbedding,
+)
+from keras_hub.src.models.backbone import Backbone
+from keras_hub.src.models.qwen3.qwen3_decoder import Qwen3TransformerDecoder
+from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm
+
+
+def _qwen3_kernel_initializer(stddev=0.02):
+    return keras.initializers.RandomNormal(stddev=stddev)
+
+
+@keras_hub_export("keras_hub.models.Qwen3Backbone")
+class Qwen3Backbone(Backbone):
+    """The Qwen3 Transformer core architecture with hyperparameters.
+
+    This network implements a Transformer-based decoder network,
+    Qwen3, as described in the Qwen3 model architecture.
+    It includes the embedding lookups and transformer layers.
+
+    The default constructor gives a fully customizable, randomly initialized
+    Qwen3 model with any number of layers, heads, and embedding
+    dimensions. To load preset architectures and weights, use the `from_preset`
+    constructor.
+
+    Args:
+        vocabulary_size (int): The size of the token vocabulary.
+        num_layers (int): The number of transformer layers.
+        num_query_heads (int): The number of query attention heads for
+            each transformer.
+        hidden_dim (int): The size of the transformer encoding and pooling
+            layers.
+        intermediate_dim (int): The output dimension of the first Dense layer in
+            a three-layer feedforward network for each transformer.
+        num_key_value_heads (int): The number of key and value attention heads
+            for each transformer.
+        rope_max_wavelength (int, optional): The maximum angular wavelength of
+            the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
+        rope_scaling_factor (float, optional): The scaling factor for
+            calculation of rotary embedding. Defaults to `1.0`.
+        layer_norm_epsilon (float, optional): Epsilon for the layer
+            normalization layers in the transformer decoder. Defaults to `1e-6`.
+        dropout (float, optional): Dropout rate for attention and hidden layers.
+            Defaults to `0`.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
+            for model computations and weights. Note that some computations,
+            such as softmax and layer normalization, will always be done at
+            float32 precision regardless of dtype.
+        tie_word_embeddings (bool, optional): Whether to tie input and output
+            embeddings. Defaults to `True`.
+        sliding_window_size (int, optional): Size of the sliding window for
+            attention when enabled. Defaults to `32768`.
+
+    Examples:
+
+    ```python
+    input_data = {
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+    }
+
+    # Pretrained Qwen3 decoder.
+    model = keras_hub.models.Qwen3Backbone.from_preset("qwen32.5_0.5b_en")
+    model(input_data)
+
+    # Randomly initialized Qwen3 decoder with custom config.
+    model = keras_hub.models.Qwen3Backbone(
+        vocabulary_size=10,
+        hidden_dim=512,
+        num_layers=2,
+        num_query_heads=32,
+        num_key_value_heads=8,
+        intermediate_dim=1024,
+        layer_norm_epsilon=1e-6,
+        dtype="float32"
+    )
+    model(input_data)
+    ```
+    """
+
+    def __init__(
+        self,
+        vocabulary_size,
+        num_layers,
+        num_query_heads,
+        num_key_value_heads,
+        head_dim,
+        hidden_dim,
+        intermediate_dim,
+        rope_max_wavelength=10000,
+        rope_scaling_factor=1.0,
+        layer_norm_epsilon=1e-6,
+        dropout=0.0,
+        tie_word_embeddings=True,
+        sliding_window_size=32768,
+        dtype=None,
+        **kwargs,
+    ):
+        # === Layers ===
+        self.token_embedding = ReversibleEmbedding(
+            input_dim=vocabulary_size,
+            output_dim=hidden_dim,
+            tie_weights=tie_word_embeddings,
+            embeddings_initializer=_qwen3_kernel_initializer(stddev=0.01),
+            dtype=dtype,
+            name="token_embedding",
+        )
+        self.transformer_layers = []
+        for i in range(num_layers):
+            layer = Qwen3TransformerDecoder(
+                intermediate_dim=intermediate_dim,
+                head_dim=head_dim,
+                num_query_heads=num_query_heads,
+                num_key_value_heads=num_key_value_heads,
+                rope_max_wavelength=rope_max_wavelength,
+                rope_scaling_factor=rope_scaling_factor,
+                layer_norm_epsilon=layer_norm_epsilon,
+                activation=ops.silu,
+                kernel_initializer=_qwen3_kernel_initializer(stddev=0.02),
+                dropout=dropout,
+                sliding_window_size=sliding_window_size,
+                dtype=dtype,
+                name=f"transformer_layer_{i}",
+            )
+            self.transformer_layers.append(layer)
+        self.layer_norm = Qwen3LayerNorm(
+            epsilon=layer_norm_epsilon,
+            dtype=dtype,
+            name="sequence_output_layernorm",
+        )
+
+        # === Functional Model ===
+        token_id_input = keras.Input(
+            shape=(None,), dtype="int32", name="token_ids"
+        )
+        padding_mask_input = keras.Input(
+            shape=(None,), dtype="int32", name="padding_mask"
+        )
+        x = self.token_embedding(token_id_input)
+        for transformer_layer in self.transformer_layers:
+            x = transformer_layer(x, decoder_padding_mask=padding_mask_input)
+        sequence_output = self.layer_norm(x)
+        super().__init__(
+            inputs={
+                "token_ids": token_id_input,
+                "padding_mask": padding_mask_input,
+            },
+            outputs=sequence_output,
+            dtype=dtype,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.vocabulary_size = vocabulary_size
+        self.num_layers = num_layers
+        self.num_query_heads = num_query_heads
+        self.hidden_dim = hidden_dim
+        self.head_dim = head_dim
+        self.intermediate_dim = intermediate_dim
+        self.rope_max_wavelength = rope_max_wavelength
+        self.num_key_value_heads = num_key_value_heads
+        self.rope_scaling_factor = rope_scaling_factor
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.dropout = dropout
+        self.tie_word_embeddings = tie_word_embeddings
+        self.sliding_window_size = sliding_window_size
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "vocabulary_size": self.vocabulary_size,
+                "num_layers": self.num_layers,
+                "num_query_heads": self.num_query_heads,
+                "hidden_dim": self.hidden_dim,
+                "head_dim": self.head_dim,
+                "intermediate_dim": self.intermediate_dim,
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
+                "num_key_value_heads": self.num_key_value_heads,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "dropout": self.dropout,
+                "tie_word_embeddings": self.tie_word_embeddings,
+                "sliding_window_size": self.sliding_window_size,
+            }
+        )
+        return config
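A minimal sketch of constructing a small, randomly initialized backbone directly from this class. Note that `head_dim` has no default in the constructor above, so it must be passed even though the docstring example omits it; all values below are arbitrary illustration, not a shipped configuration.

```python
import numpy as np
import keras_hub

backbone = keras_hub.models.Qwen3Backbone(
    vocabulary_size=1000,
    num_layers=2,
    num_query_heads=8,
    num_key_value_heads=4,
    head_dim=32,          # required; see __init__ above
    hidden_dim=256,
    intermediate_dim=512,
)
outputs = backbone(
    {
        "token_ids": np.ones((1, 12), dtype="int32"),
        "padding_mask": np.ones((1, 12), dtype="int32"),
    }
)
print(outputs.shape)  # (1, 12, 256) -- final hidden states after the RMS norm
```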
keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py
ADDED
@@ -0,0 +1,10 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
+from keras_hub.src.models.qwen3.qwen3_tokenizer import Qwen3Tokenizer
+
+
+@keras_hub_export("keras_hub.models.Qwen3CausalLMPreprocessor")
+class Qwen3CausalLMPreprocessor(CausalLMPreprocessor):
+    backbone_cls = Qwen3Backbone
+    tokenizer_cls = Qwen3Tokenizer
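This class is a thin binding of the Qwen3 backbone and tokenizer onto the generic `CausalLMPreprocessor` behavior (tokenize, pad to `sequence_length`, and produce next-token labels). A hedged usage sketch; the Hugging Face handle is an assumption for illustration, not a preset bundled with this wheel:

```python
import keras_hub

# Hypothetical checkpoint handle; substitute a real Qwen3 preset.
preprocessor = keras_hub.models.Qwen3CausalLMPreprocessor.from_preset(
    "hf://Qwen/Qwen3-0.6B", sequence_length=64
)
x, y, sample_weight = preprocessor("Keras is a deep learning framework.")
print(x["token_ids"].shape, y.shape)  # (64,) (64,); labels are inputs shifted by one
```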
keras_hub/src/models/qwen3/qwen3_decoder.py
ADDED
@@ -0,0 +1,309 @@
+import keras
+from keras import ops
+
+from keras_hub.src.layers.modeling.transformer_layer_utils import (
+    compute_causal_mask,
+)
+from keras_hub.src.layers.modeling.transformer_layer_utils import (
+    merge_padding_and_attention_mask,
+)
+from keras_hub.src.models.qwen3.qwen3_attention import Qwen3Attention
+from keras_hub.src.models.qwen3.qwen3_layernorm import Qwen3LayerNorm
+from keras_hub.src.utils.keras_utils import clone_initializer
+
+
+class Qwen3TransformerDecoder(keras.layers.Layer):
+    """A Transformer decoder layer for the Qwen3 backbone.
+
+    This layer implements a Transformer decoder block that includes
+    self-attention with optional sliding window attention and a feed-forward
+    network.
+
+    Args:
+        intermediate_dim: Output dimension of the first dense layer in the
+            feed-forward network.
+        num_query_heads: Number of query attention heads.
+        num_key_value_heads: Number of key/value attention heads (for GQA).
+        rope_max_wavelength: Maximum wavelength for RoPE (Rotary Position
+            Embedding).
+        rope_scaling_factor: Scaling factor for RoPE, used for extending
+            context length.
+        activation: Activation function to use in the feed-forward network.
+        layer_norm_epsilon: Small float added to variance to avoid dividing
+            by zero in layer norm.
+        kernel_initializer: Initializer for the kernel weights.
+        dropout: Dropout rate for attention and hidden layers.
+        sliding_window_size: Size of the sliding window for attention when
+            enabled.
+        **kwargs: Additional keyword arguments to pass to the Layer.
+    """
+
+    def __init__(
+        self,
+        intermediate_dim,
+        num_query_heads,
+        num_key_value_heads,
+        head_dim,
+        rope_max_wavelength=10000,
+        rope_scaling_factor=1.0,
+        activation="silu",
+        layer_norm_epsilon=1e-5,
+        kernel_initializer="glorot_uniform",
+        dropout=0.0,
+        sliding_window_size=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.intermediate_dim = intermediate_dim
+        self.num_query_heads = num_query_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+
+        self.rope_max_wavelength = rope_max_wavelength
+        self.rope_scaling_factor = rope_scaling_factor
+
+        self.dropout = dropout
+
+        self.sliding_window_size = sliding_window_size
+
+        self.activation = keras.activations.get(activation)
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.kernel_initializer = keras.initializers.get(kernel_initializer)
+
+        self.supports_masking = True
+
+    def build(self, decoder_sequence_shape):
+        self._decoder_sequence_shape = decoder_sequence_shape
+        self.hidden_dim = decoder_sequence_shape[-1]
+
+        # Self attention layer.
+        self._self_attention_layer = Qwen3Attention(
+            num_query_heads=self.num_query_heads,
+            num_key_value_heads=self.num_key_value_heads,
+            rope_max_wavelength=self.rope_max_wavelength,
+            head_dim=self.head_dim,
+            rope_scaling_factor=self.rope_scaling_factor,
+            kernel_initializer=clone_initializer(self.kernel_initializer),
+            dropout=self.dropout,
+            sliding_window_size=self.sliding_window_size,
+            dtype=self.dtype_policy,
+            name="self_attention",
+        )
+        self._self_attention_layer.build(decoder_sequence_shape)
+
+        self._self_attention_layernorm = Qwen3LayerNorm(
+            epsilon=self.layer_norm_epsilon,
+            dtype=self.dtype_policy,
+            name="self_attention_layernorm",
+        )
+
+        self._self_attention_layernorm.build(decoder_sequence_shape)
+        self._self_attention_dropout = keras.layers.Dropout(
+            rate=self.dropout,
+            dtype=self.dtype_policy,
+            name="self_attention_dropout",
+        )
+
+        # Feedforward layers.
+        self._feedforward_intermediate_dense = keras.layers.Dense(
+            self.intermediate_dim,
+            kernel_initializer=clone_initializer(self.kernel_initializer),
+            use_bias=False,
+            dtype=self.dtype_policy,
+            name="feedforward_intermediate_dense",
+        )
+        self._feedforward_intermediate_dense.build(decoder_sequence_shape)
+
+        self._feedforward_gate_dense = keras.layers.Dense(
+            self.intermediate_dim,
+            kernel_initializer=clone_initializer(self.kernel_initializer),
+            use_bias=False,
+            dtype=self.dtype_policy,
+            name="feedforward_gate_dense",
+        )
+        self._feedforward_gate_dense.build(decoder_sequence_shape)
+
+        self._feedforward_output_dense = keras.layers.Dense(
+            self.hidden_dim,
+            kernel_initializer=clone_initializer(self.kernel_initializer),
+            use_bias=False,
+            dtype=self.dtype_policy,
+            name="feedforward_output_dense",
+        )
+
+        self._feedforward_output_dense.build(
+            self._feedforward_gate_dense.compute_output_shape(
+                decoder_sequence_shape
+            )
+        )
+
+        self._feedforward_layernorm = Qwen3LayerNorm(
+            epsilon=self.layer_norm_epsilon,
+            dtype=self.dtype_policy,
+            name="feedforward_layernorm",
+        )
+        self._feedforward_layernorm.build(decoder_sequence_shape)
+
+        self.built = True
+
+    def call(
+        self,
+        decoder_sequence,
+        decoder_padding_mask=None,
+        decoder_attention_mask=None,
+        self_attention_cache=None,
+        self_attention_cache_update_index=None,
+        training=None,
+    ):
+        """Forward pass for the decoder layer.
+
+        Args:
+            decoder_sequence: Input tensor of shape [batch_size, seq_length,
+                hidden_size].
+            decoder_padding_mask: Mask tensor for padding tokens.
+            decoder_attention_mask: Additional attention mask.
+            self_attention_cache: Optional cached key and value tensors for
+                self-attention.
+            self_attention_cache_update_index: Index at which to update the
+                cache.
+            training: Boolean indicating whether in training mode.
+
+        Returns:
+            decoder_output: Output tensor after applying transformer decoder
+                block.
+            self_attention_cache: Updated cache tensors (if cache is provided).
+        """
+        self_attention_mask = self._compute_self_attention_mask(
+            decoder_sequence=decoder_sequence,
+            decoder_padding_mask=decoder_padding_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            self_attention_cache=self_attention_cache,
+            self_attention_cache_update_index=self_attention_cache_update_index,
+        )
+        residual = decoder_sequence
+
+        x = self._self_attention_layernorm(decoder_sequence)
+
+        # Self attention block.
+        x = self._self_attention_layer(
+            hidden_states=x,
+            attention_mask=self_attention_mask,
+            cache=self_attention_cache,
+            cache_update_index=self_attention_cache_update_index,
+        )
+
+        if self_attention_cache is not None:
+            x, self_attention_cache = x
+
+        x = self._self_attention_dropout(x, training=training)
+
+        x = x + residual
+        residual = x
+
+        x = self._feedforward_layernorm(x)
+        gate_output = self._feedforward_gate_dense(x)
+
+        # Note that we run the activation function in full 32-bit
+        # precision since this is what `torch.nn.functional.silu`
+        # does. Internally, `torch.nn.functional.silu` converts the
+        # inputs to float32, computes SiLU, and converts the outputs
+        # back to compute dtype.
+        # CPU Kernel: https://github.com/pytorch/pytorch/blob/35c493f2cf9b623bfdc7e6b34dc1cb39690a7919/aten/src/ATen/native/cpu/Activation.cpp#L1221-L1235 # noqa: E501
+        # CUDA Kernel: https://github.com/pytorch/pytorch/blob/35c493f2cf9b623bfdc7e6b34dc1cb39690a7919/aten/src/ATen/native/cuda/ActivationSiluKernel.cu # noqa: E501
+        gate_output = ops.cast(gate_output, "float32")
+        gate_output = self.activation(gate_output)
+        gate_output = ops.cast(gate_output, self.compute_dtype)
+
+        x = self._feedforward_intermediate_dense(x)
+
+        x = self._feedforward_output_dense(ops.multiply(x, gate_output))
+
+        decoder_output = x + residual
+
+        if self_attention_cache is not None:
+            return decoder_output, self_attention_cache
+        return decoder_output
+
+    def _compute_self_attention_mask(
+        self,
+        decoder_sequence,
+        decoder_padding_mask,
+        decoder_attention_mask,
+        self_attention_cache,
+        self_attention_cache_update_index,
+    ):
+        """Computes the self-attention mask combining causal, padding and
+        attention masks.
+
+        Args:
+            decoder_sequence: Input tensor.
+            decoder_padding_mask: Mask tensor for padding tokens.
+            decoder_attention_mask: Additional attention mask.
+            self_attention_cache: Optional cached key and value tensors.
+            self_attention_cache_update_index: Index at which to update the
+                cache.
+
+        Returns:
+            Combined attention mask tensor.
+        """
+        decoder_mask = merge_padding_and_attention_mask(
+            decoder_sequence, decoder_padding_mask, decoder_attention_mask
+        )
+        batch_size = ops.shape(decoder_sequence)[0]
+        input_length = output_length = ops.shape(decoder_sequence)[1]
+        # We need to handle a rectangular causal mask when doing cached
+        # decoding. For generative inference, `decoder_sequence` will
+        # generally be length 1, and `cache` will be the full generation length.
+        if self_attention_cache is not None:
+            input_length = ops.shape(self_attention_cache)[2]
+
+        cache_update_index = (
+            0
+            if self_attention_cache_update_index is None
+            else self_attention_cache_update_index
+        )
+
+        causal_mask = compute_causal_mask(
+            batch_size, input_length, output_length, cache_update_index
+        )
+
+        return (
+            ops.minimum(decoder_mask, causal_mask)
+            if decoder_mask is not None
+            else causal_mask
+        )
+
+    def compute_output_shape(self, decoder_sequence_shape):
+        """Computes the output shape of the layer.
+
+        Args:
+            decoder_sequence_shape: Shape of the decoder sequence input.
+
+        Returns:
+            Output shape, which is the same as the input shape.
+        """
+        return decoder_sequence_shape
+
+    def get_config(self):
+        """Returns the config of the layer.
+
+        Returns:
+            Dictionary containing the parameters used to initialize this layer.
+        """
+        config = super().get_config()
+        config.update(
+            {
+                "intermediate_dim": self.intermediate_dim,
+                "num_query_heads": self.num_query_heads,
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
+                "num_key_value_heads": self.num_key_value_heads,
+                "activation": keras.activations.serialize(self.activation),
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "kernel_initializer": keras.initializers.serialize(
+                    self.kernel_initializer
+                ),
+                "dropout": self.dropout,
+            }
+        )
+        return config
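The feed-forward block in `call()` above is the gated-SiLU ("SwiGLU") formulation: the gate projection is passed through SiLU (in float32 for parity with `torch.nn.functional.silu`) and multiplied elementwise with the intermediate projection before the output projection. A minimal functional sketch of the same dataflow with random toy weights, omitting the float32 round-trip:

```python
import numpy as np
from keras import ops

def swiglu_ffn(x, w_gate, w_up, w_down):
    # Mirrors the decoder block: down( silu(x @ w_gate) * (x @ w_up) )
    gate = ops.silu(ops.matmul(x, w_gate))
    up = ops.matmul(x, w_up)
    return ops.matmul(gate * up, w_down)

hidden_dim, intermediate_dim = 8, 16
x = np.random.rand(2, 4, hidden_dim).astype("float32")
w_gate = np.random.rand(hidden_dim, intermediate_dim).astype("float32")
w_up = np.random.rand(hidden_dim, intermediate_dim).astype("float32")
w_down = np.random.rand(intermediate_dim, hidden_dim).astype("float32")
print(swiglu_ffn(x, w_gate, w_up, w_down).shape)  # (2, 4, 8)
```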
keras_hub/src/models/qwen3/qwen3_layernorm.py
ADDED
@@ -0,0 +1,38 @@
+import keras
+from keras import ops
+
+
+class Qwen3LayerNorm(keras.layers.Layer):
+    """A normalization layer for Qwen that implements RMS normalization."""
+
+    def __init__(self, head_dim=None, epsilon=1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self.head_dim = head_dim
+        self.epsilon = epsilon
+
+    def build(self, input_shape):
+        if self.head_dim:
+            dim = self.head_dim
+        else:
+            dim = input_shape[-1]
+
+        self.scale = self.add_weight(
+            name="scale",
+            trainable=True,
+            shape=(dim,),
+            initializer="ones",
+            dtype=self.variable_dtype,
+        )
+        self.built = True
+
+    def call(self, x):
+        input_dtype = x.dtype
+        x = ops.cast(x, "float32")
+        var = ops.mean(ops.power(x, 2), axis=-1, keepdims=True)
+        x = x * ops.rsqrt(var + self.epsilon)
+        return ops.cast(x * self.scale, input_dtype)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({"epsilon": self.epsilon})
+        return config
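`Qwen3LayerNorm` is RMS normalization: each feature vector is divided by the root mean square of its entries (no mean subtraction), then scaled by a learned per-channel weight. The same computation in plain NumPy on a toy 4-dimensional vector:

```python
import numpy as np

def rms_norm(x, scale, epsilon=1e-6):
    # Normalize by sqrt(mean(x^2)), not by a variance around the mean.
    mean_square = np.mean(np.square(x), axis=-1, keepdims=True)
    return x / np.sqrt(mean_square + epsilon) * scale

x = np.array([1.0, 2.0, 3.0, 4.0], dtype="float32")
scale = np.ones(4, dtype="float32")
print(rms_norm(x, scale))  # each entry divided by sqrt((1+4+9+16)/4) ~= 2.739
```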
keras_hub/src/models/qwen3/qwen3_tokenizer.py
ADDED
@@ -0,0 +1,48 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
+from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer
+
+
+@keras_hub_export(
+    "keras_hub.models.Qwen3Tokenizer",
+)
+class Qwen3Tokenizer(BytePairTokenizer):
+    """Tokenizer for Qwen3 models.
+
+    This tokenizer implements byte-pair encoding (BPE) for Qwen3 models,
+    handling special tokens like BOS (beginning of sequence) and EOS (end of
+    sequence).
+
+    Args:
+        vocabulary: Dictionary mapping tokens to token IDs, or path to
+            vocabulary file.
+        merges: List of BPE merges, or path to merges file.
+        bos_token: Beginning of sequence token. Defaults to None.
+        eos_token: End of sequence token. Defaults to "<|endoftext|>".
+        misc_special_tokens: Set of additional special tokens. Defaults to
+            empty set.
+    """
+
+    backbone_cls = Qwen3Backbone
+
+    def __init__(
+        self,
+        vocabulary=None,
+        merges=None,
+        **kwargs,
+    ):
+        # Add EOS token
+        eos_token = "<|im_end|>"
+        self._add_special_token(eos_token, "end_token")
+
+        pad_token = "<|endoftext|>"
+        self._add_special_token(pad_token, "pad_token")
+
+        self.start_token_id = None
+        self.start_token = None
+
+        super().__init__(
+            vocabulary=vocabulary,
+            merges=merges,
+            **kwargs,
+        )
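A minimal instantiation sketch with a toy vocabulary, assuming only what the constructor above requires: real assets normally come from a preset, and the two special tokens registered in `__init__` ("<|im_end|>" and "<|endoftext|>") must be present in whatever vocabulary is supplied.

```python
from keras_hub.src.models.qwen3.qwen3_tokenizer import Qwen3Tokenizer

# Toy vocabulary and merge rules, for illustration only.
vocab = {"<|endoftext|>": 0, "<|im_end|>": 1, "a": 2, "b": 3, "ab": 4}
merges = ["a b"]

tokenizer = Qwen3Tokenizer(vocabulary=vocab, merges=merges)
print(tokenizer("ab"))  # expected to merge "a"+"b" into the single id 4
```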
keras_hub/src/models/qwen_moe/qwen_moe_presets.py
CHANGED
@@ -4,8 +4,8 @@ backbone_presets = {
     "qwen1.5_moe_2.7b_en": {
         "metadata": {
             "description": (
-                "24-layer Qwen MoE model with 2.7 billion active parameters "
-                "and 8 experts per MoE layer."
+                "24-layer Qwen MoE model with 2.7 billion active parameters "
+                "and 8 experts per MoE layer."
             ),
             "params": 14315784192,
             "path": "qwen-1.5-moe",
keras_hub/src/utils/transformers/convert_qwen3.py
ADDED
@@ -0,0 +1,145 @@
+import numpy as np
+
+from keras_hub.src.models.qwen3.qwen3_backbone import Qwen3Backbone
+from keras_hub.src.utils.preset_utils import load_json
+
+backbone_cls = Qwen3Backbone
+
+
+def convert_backbone_config(transformers_config):
+    return {
+        "vocabulary_size": transformers_config["vocab_size"],
+        "head_dim": transformers_config["head_dim"],
+        "hidden_dim": transformers_config["hidden_size"],
+        "num_layers": transformers_config["num_hidden_layers"],
+        "num_query_heads": transformers_config["num_attention_heads"],
+        "num_key_value_heads": transformers_config["num_key_value_heads"],
+        "intermediate_dim": transformers_config["intermediate_size"],
+        "layer_norm_epsilon": transformers_config["rms_norm_eps"],
+        "rope_max_wavelength": transformers_config["rope_theta"],
+        "sliding_window_size": transformers_config["sliding_window"]
+        if transformers_config["use_sliding_window"]
+        else None,
+        "tie_word_embeddings": transformers_config["tie_word_embeddings"],
+    }
+
+
+def convert_weights(backbone, loader, transformers_config):
+    loader.port_weight(
+        keras_variable=backbone.get_layer("token_embedding").embeddings,
+        hf_weight_key="model.embed_tokens.weight",
+    )
+    if not backbone.tie_word_embeddings:
+        loader.port_weight(
+            keras_variable=backbone.get_layer(
+                "token_embedding"
+            ).reverse_embeddings,
+            hf_weight_key="lm_head.weight",
+            # rearrange_pattern="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+
+    def transpose_and_reshape(x, shape):
+        return np.reshape(np.transpose(x), shape)
+
+    for i in range(backbone.num_layers):
+        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")
+
+        # Input layernorm
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layernorm.scale,
+            hf_weight_key=f"model.layers.{i}.input_layernorm.weight",
+        )
+
+        # Attention layers
+
+        ## Query
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._query_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.q_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._query_dense_layer_norm.scale,
+            hf_weight_key=f"model.layers.{i}.self_attn.q_norm.weight",
+        )
+        ## Key
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._key_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.k_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._key_dense_layer_norm.scale,
+            hf_weight_key=f"model.layers.{i}.self_attn.k_norm.weight",
+        )
+        ## Value
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._value_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.v_proj.weight",
+            hook_fn=transpose_and_reshape,
+        )
+        ## Output
+        loader.port_weight(
+            keras_variable=decoder_layer._self_attention_layer._output_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
+            # rearrange_patterns="c (a b) -> a b c",
+            # rearrange_dims={"a": backbone.num_query_heads},
+            hook_fn=transpose_and_reshape,
+        )
+
+        # MLP layers
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_intermediate_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.up_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_output_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.down_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_gate_dense.kernel,
+            hf_weight_key=f"model.layers.{i}.mlp.gate_proj.weight",
+            # rearrange_patterns="b a -> a b",
+            hook_fn=lambda hf_tensor, _: np.transpose(hf_tensor, axes=(1, 0)),
+        )
+
+        # Feedforward layernorm
+        loader.port_weight(
+            keras_variable=decoder_layer._feedforward_layernorm.scale,
+            hf_weight_key=f"model.layers.{i}.post_attention_layernorm.weight",
+        )
+
+    # Final normalization layer
+    loader.port_weight(
+        keras_variable=backbone.get_layer("sequence_output_layernorm").scale,
+        hf_weight_key="model.norm.weight",
+    )
+
+    return backbone
+
+
+def convert_tokenizer(cls, preset, **kwargs):
+    tokenizer_config = load_json(preset, "tokenizer.json")
+    vocab = tokenizer_config["model"]["vocab"]
+    merges = tokenizer_config["model"]["merges"]
+    merges = [" ".join(item) for item in merges]
+
+    # Load all special tokens with the exception of "reserved" ones.
+    special_tokens = set()
+    for token in tokenizer_config["added_tokens"]:
+        if not token["content"].startswith("<|reserved_special_token_"):
+            vocab[token["content"]] = token["id"]
+            special_tokens.add(token["content"])
+
+    kwargs.update(
+        {
+            "unsplittable_tokens": list(special_tokens),
+        }
+    )
+
+    return cls(vocabulary=vocab, merges=merges, **kwargs)
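With this converter registered (see the preset loader change that follows), a Hugging Face Qwen3 checkpoint can be mapped onto the Keras backbone directly through `from_preset`. A hedged sketch; the handle below is an assumption for illustration, and any safetensors checkpoint whose `config.json` declares `model_type: "qwen3"` should take the same path.

```python
import keras_hub

# Illustrative Hugging Face handle; not a preset bundled with this wheel.
backbone = keras_hub.models.Qwen3Backbone.from_preset("hf://Qwen/Qwen3-0.6B")
print(backbone.count_params())  # weights ported via convert_weights() above
```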
keras_hub/src/utils/transformers/preset_loader.py
CHANGED
@@ -14,6 +14,7 @@ from keras_hub.src.utils.transformers import convert_mistral
 from keras_hub.src.utils.transformers import convert_mixtral
 from keras_hub.src.utils.transformers import convert_pali_gemma
 from keras_hub.src.utils.transformers import convert_qwen
+from keras_hub.src.utils.transformers import convert_qwen3
 from keras_hub.src.utils.transformers import convert_qwen_moe
 from keras_hub.src.utils.transformers import convert_vit
 from keras_hub.src.utils.transformers.safetensor_utils import SafetensorLoader
@@ -50,6 +51,8 @@ class TransformersPresetLoader(PresetLoader):
             self.converter = convert_mixtral
         elif model_type == "qwen2_moe":
             self.converter = convert_qwen_moe
+        elif model_type == "qwen3":
+            self.converter = convert_qwen3
         else:
             raise ValueError(
                 "KerasHub has no converter for huggingface/transformers models "
keras_hub/src/version.py
CHANGED
-__version__ = "0.22.0.dev202505290412"
+__version__ = "0.22.0.dev202505300409"
{keras_hub_nightly-0.22.0.dev202505290412.dist-info → keras_hub_nightly-0.22.0.dev202505300409.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 keras_hub/__init__.py,sha256=bJbUZkqwhZvTb1Tqx1fbkq6mzBYiEyq-Hin3oQIkhdE,558
 keras_hub/layers/__init__.py,sha256=gnvT-GuASB1hZwY4zrRkLs5yohSQu9Pp1SHDxsWPLY8,5081
 keras_hub/metrics/__init__.py,sha256=KYalsMPBnfwim9BdGHFfJ5WxUKFXOQ1QoKIMT_0lwlM,439
-keras_hub/models/__init__.py,sha256=
+keras_hub/models/__init__.py,sha256=1ZKgLK4AZ44s_cH7vu6FvmVocxf0biLAnY_lEh3dgxw,26734
 keras_hub/samplers/__init__.py,sha256=aFQIkiqbZpi8vjrPp2MVII4QUfE-eQjra5fMeHsoy7k,886
 keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
-keras_hub/src/version.py,sha256=
+keras_hub/src/version.py,sha256=hJyx_F3-Sy3RRaPET6xBnbg7QRtPkFgRHC4_SaxL3bw,222
 keras_hub/src/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/layers/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/layers/modeling/alibi_bias.py,sha256=1XBTHI52L_iJDhN_w5ydu_iMhCuTgQAxEPwcLA6BPuk,4411
@@ -259,7 +259,7 @@ keras_hub/src/models/mixtral/mixtral_causal_lm.py,sha256=JA1t6xTeaYX_fNo9ftRyvzd
 keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py,sha256=q2qXa9QAUWBvOWv9DeNvwsBNXSORJAbQFoQsWQ7e8V8,3079
 keras_hub/src/models/mixtral/mixtral_decoder.py,sha256=CvOjhTxPnGQ_HNknZXRI6Cx1kpuHG99_TiOh-mNcsDw,18190
 keras_hub/src/models/mixtral/mixtral_layer_norm.py,sha256=zfbDKZEb45FTwP0zQd7WPPp8tuiGoSNfS-DRYWkZyWw,1031
-keras_hub/src/models/mixtral/mixtral_presets.py,sha256=
+keras_hub/src/models/mixtral/mixtral_presets.py,sha256=pi5hHcwVSqr7ytf4dSnU_ew_t7NYw7EsZrmklQDqDVo,852
 keras_hub/src/models/mixtral/mixtral_tokenizer.py,sha256=Kc233k879QMyX164X_CzWbqpnqEkKWNqa648guTGkBk,661
 keras_hub/src/models/mobilenet/__init__.py,sha256=hxkNGGj_iAMu62iooUDEPA818sNOIgjG7pXMLEMOsAE,275
 keras_hub/src/models/mobilenet/mobilenet_backbone.py,sha256=aZBSFeLUObYYoi3od9DI1KfgPCqh5GHTcAI8Y2ZHShA,29536
@@ -311,8 +311,14 @@ keras_hub/src/models/qwen/qwen_causal_lm.py,sha256=_f-UHaKHp0ncxknpkpEJiW3jlng3E
 keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py,sha256=Va-4TLJD3ycEnkS41rF3dVj4_6K0j-gxLTrREFRcyr0,609
 keras_hub/src/models/qwen/qwen_decoder.py,sha256=utmAvZlU7_nP-6pjGPDinK4JaMzsQSwOARG0ote-jAg,11771
 keras_hub/src/models/qwen/qwen_layernorm.py,sha256=DS35r3qd6g5ocL7Nhf_vNzLLMo1aI9VCSmL64dgNOYI,924
-keras_hub/src/models/qwen/qwen_presets.py,sha256=
+keras_hub/src/models/qwen/qwen_presets.py,sha256=1FkKV6M3yqJz4EP1xa7bEvfIQ721xXT-_ikjWX0xvww,1992
 keras_hub/src/models/qwen/qwen_tokenizer.py,sha256=LCv3IyiDDHqVnM9N3lf5-BE3iwicIh0nKS1hjoPw9lE,1532
+keras_hub/src/models/qwen3/qwen3_attention.py,sha256=sewLjli290XvJ1efGZJEAYqUZfRll7cmhu0258s4C48,13042
+keras_hub/src/models/qwen3/qwen3_backbone.py,sha256=Ylpk_rRWWRxy8irlAPjJU-YrxYGpo8c9lSEO1zZl4gU,7456
+keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py,sha256=H4g-bgvuhAUnDwjJovydK16Kes38ZFZWPvflrgHqZis,458
+keras_hub/src/models/qwen3/qwen3_decoder.py,sha256=68s9jQj53zFmXE4-SGXKYHu546fXOyi9LUbnKk-HGYY,11595
+keras_hub/src/models/qwen3/qwen3_layernorm.py,sha256=EJxjf7Pr6ufPQnNeuYQxkExzPjPk4PQxqMsoBeSEkDo,1073
+keras_hub/src/models/qwen3/qwen3_tokenizer.py,sha256=LmPtg0vprMchDvYfTj8m5PraXI2QS3-YgdIIpIm5iAs,1448
 keras_hub/src/models/qwen_moe/__init__.py,sha256=5D8GUmVDsJs0J4sVZHcXOLkZf12U96l-WtwyVee4lu8,267
 keras_hub/src/models/qwen_moe/qwen_moe_attention.py,sha256=pE79_iHUm2LGkoWL6zMJw_pNfzIvmyq3yJaiq47W2TY,13242
 keras_hub/src/models/qwen_moe/qwen_moe_backbone.py,sha256=nrfELvIvRLmrgKrUNXci2CrecmeI6bWzJj7HH-RcWJA,15341
@@ -320,7 +326,7 @@ keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py,sha256=MeP60v7GcN_SmH5_ULRpq
 keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py,sha256=uKaXRrJs02vkVudjdehzJPp0B84tPMkxNHlp166kceE,589
 keras_hub/src/models/qwen_moe/qwen_moe_decoder.py,sha256=kmUjLpYTbJQ3J_31qWhLOd0Dg2_9cl_JX_zM8ZMH1Qo,23130
 keras_hub/src/models/qwen_moe/qwen_moe_layernorm.py,sha256=DbkWJo7U0-cwdZwHPeAnFznYwtao6o0fjpoDJ9UWnpc,927
-keras_hub/src/models/qwen_moe/qwen_moe_presets.py,sha256=
+keras_hub/src/models/qwen_moe/qwen_moe_presets.py,sha256=LhOA3Ow-z3cNTan4AOrtyCXS58EgfvO_gtqiZt5cUQc,455
 keras_hub/src/models/qwen_moe/qwen_moe_tokenizer.py,sha256=2c3X8jNGO0q0UL5NtUqSgHWLqhyJGi2ohNcTeOGhd84,1407
 keras_hub/src/models/resnet/__init__.py,sha256=C5UqlQ6apm8WSp1bnrxB6Bi3BGaknxRQs-r3b2wpaGA,257
 keras_hub/src/models/resnet/resnet_backbone.py,sha256=Q7nlqcTXZzjqd0e-DsjHC4ok58yOX7qxseotym3uZpM,31276
@@ -496,13 +502,14 @@ keras_hub/src/utils/transformers/convert_mistral.py,sha256=kVhN9h1ZFVhwkNW8p3wnS
 keras_hub/src/utils/transformers/convert_mixtral.py,sha256=PxeCY8Xe7U_caICugwOCEjuSZ51ZUtmef6rUxh-Wt54,5508
 keras_hub/src/utils/transformers/convert_pali_gemma.py,sha256=B1leeDw96Yvu81hYumf66hIid07k5NLqoeWAJgPnaLs,10649
 keras_hub/src/utils/transformers/convert_qwen.py,sha256=WUxMAEFVqRs7TRw7QU5TH3_ev4yf02R1xFVliMvTQqg,5886
+keras_hub/src/utils/transformers/convert_qwen3.py,sha256=LIormvCMWPq6X9Wo2eNbADjtFZ0nI7tFGZFBxmo4GKw,5700
 keras_hub/src/utils/transformers/convert_qwen_moe.py,sha256=a7R28aln-PdAcNuKAXdrtzvslho2Co6GypChxLMKPpc,10618
 keras_hub/src/utils/transformers/convert_vit.py,sha256=9SUZ9utNJhW_5cj3acMn9cRy47u2eIcDsrhmzj77o9k,5187
-keras_hub/src/utils/transformers/preset_loader.py,sha256=
+keras_hub/src/utils/transformers/preset_loader.py,sha256=7tFnbyAiUCMcTG8VQ7Wpi-J7cvRoSZn-ZYE_l0xuh0M,4363
 keras_hub/src/utils/transformers/safetensor_utils.py,sha256=CYUHyA4y-B61r7NDnCsFb4t_UmSwZ1k9L-8gzEd6KRg,3339
 keras_hub/tokenizers/__init__.py,sha256=uMjjm0mzUkRb0e4Ac_JK8aJ9cKGUi5UqmzWoWAFJprE,4164
 keras_hub/utils/__init__.py,sha256=jXPqVGBpJr_PpYmqD8aDG-fRMlxH-ulqCR2SZMn288Y,646
-keras_hub_nightly-0.22.0.
-keras_hub_nightly-0.22.0.
-keras_hub_nightly-0.22.0.
-keras_hub_nightly-0.22.0.
+keras_hub_nightly-0.22.0.dev202505300409.dist-info/METADATA,sha256=hH3xqnggYJvyKQ7DG5U0pJyM8umkP1oRPj32GKEu1E8,7393
+keras_hub_nightly-0.22.0.dev202505300409.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+keras_hub_nightly-0.22.0.dev202505300409.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
+keras_hub_nightly-0.22.0.dev202505300409.dist-info/RECORD,,
|