keras-hub-nightly 0.23.0.dev202509190415__py3-none-any.whl → 0.23.0.dev202509290422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (32)
  1. keras_hub/layers/__init__.py +3 -0
  2. keras_hub/models/__init__.py +24 -0
  3. keras_hub/src/models/depth_anything/__init__.py +9 -0
  4. keras_hub/src/models/depth_anything/depth_anything_backbone.py +232 -0
  5. keras_hub/src/models/depth_anything/depth_anything_depth_estimator.py +70 -0
  6. keras_hub/src/models/depth_anything/depth_anything_depth_estimator_preprocessor.py +16 -0
  7. keras_hub/src/models/depth_anything/depth_anything_image_converter.py +10 -0
  8. keras_hub/src/models/depth_anything/depth_anything_layers.py +725 -0
  9. keras_hub/src/models/depth_anything/depth_anything_loss.py +89 -0
  10. keras_hub/src/models/depth_anything/depth_anything_presets.py +4 -0
  11. keras_hub/src/models/depth_anything/interpolate.py +62 -0
  12. keras_hub/src/models/depth_estimator.py +239 -0
  13. keras_hub/src/models/depth_estimator_preprocessor.py +78 -0
  14. keras_hub/src/models/dinov2/dinov2_backbone.py +29 -3
  15. keras_hub/src/models/dinov2/dinov2_layers.py +13 -3
  16. keras_hub/src/models/qwen3_moe/qwen3_moe_attention.py +371 -0
  17. keras_hub/src/models/qwen3_moe/qwen3_moe_backbone.py +365 -0
  18. keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm.py +357 -0
  19. keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_preprocessor.py +12 -0
  20. keras_hub/src/models/qwen3_moe/qwen3_moe_decoder.py +672 -0
  21. keras_hub/src/models/qwen3_moe/qwen3_moe_layernorm.py +45 -0
  22. keras_hub/src/models/qwen3_moe/qwen3_moe_tokenizer.py +48 -0
  23. keras_hub/src/tests/test_case.py +3 -2
  24. keras_hub/src/utils/transformers/convert_dinov2.py +1 -0
  25. keras_hub/src/utils/transformers/convert_qwen3_moe.py +216 -0
  26. keras_hub/src/utils/transformers/preset_loader.py +3 -0
  27. keras_hub/src/version.py +1 -1
  28. keras_hub/tokenizers/__init__.py +3 -0
  29. {keras_hub_nightly-0.23.0.dev202509190415.dist-info → keras_hub_nightly-0.23.0.dev202509290422.dist-info}/METADATA +1 -1
  30. {keras_hub_nightly-0.23.0.dev202509190415.dist-info → keras_hub_nightly-0.23.0.dev202509290422.dist-info}/RECORD +32 -13
  31. {keras_hub_nightly-0.23.0.dev202509190415.dist-info → keras_hub_nightly-0.23.0.dev202509290422.dist-info}/WHEEL +0 -0
  32. {keras_hub_nightly-0.23.0.dev202509190415.dist-info → keras_hub_nightly-0.23.0.dev202509290422.dist-info}/top_level.txt +0 -0
keras_hub/src/models/qwen3_moe/qwen3_moe_backbone.py
@@ -0,0 +1,365 @@
+ import keras
+ from keras import ops
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.layers.modeling.reversible_embedding import (
+     ReversibleEmbedding,
+ )
+ from keras_hub.src.models.backbone import Backbone
+ from keras_hub.src.models.qwen3_moe.qwen3_moe_decoder import (
+     Qwen3MoeTransformerDecoder,
+ )
+ from keras_hub.src.models.qwen3_moe.qwen3_moe_layernorm import Qwen3MoeLayerNorm
+
+
+ def _qwen3_moe_kernel_initializer(stddev=0.02):
+     return keras.initializers.RandomNormal(stddev=stddev)
+
+
+ @keras_hub_export(
+     "keras_hub.models.Qwen3MoeBackbone",
+ )
+ class Qwen3MoeBackbone(Backbone):
+     """Qwen3 MoE core network with hyperparameters.
+
+     This backbone implements the base Transformer network for the Qwen3 MoE
+     model. It includes embedding lookups and transformer layers with a
+     Mixture of Experts (MoE) architecture, where each layer uses a sparse set
+     of experts for efficient computation. This backbone outputs the final
+     hidden states for each token, not generative predictions over the
+     vocabulary space. For a higher-level object for text generation, see
+     `keras_hub.models.Qwen3MoeCausalLM`.
+
+     The default constructor gives a fully customizable, randomly initialized
+     Qwen3 MoE model with any number of layers, heads, and embedding
+     dimensions. To load preset architectures and weights, use the
+     `from_preset` constructor.
+
+     Args:
+         vocabulary_size: int. The size of the token vocabulary.
+         num_layers: int. The number of transformer layers.
+         num_query_heads: int. The number of heads for the query projections
+             in the attention layer.
+         num_key_value_heads: int. The number of heads for the key and value
+             projections in the attention layer.
+         hidden_dim: int. The size of the transformer hidden state at the end
+             of each transformer layer.
+         intermediate_dim: int. The output dimension of the first Dense layer
+             in the feedforward network for each transformer.
+         moe_intermediate_dim: int. The intermediate dimension for each expert
+             in the MoE feedforward network.
+         num_experts: int. The number of experts in each MoE layer.
+         top_k: int. The number of top experts to select for each token in
+             the MoE layer.
+         head_dim: int. The size of each attention head.
+         layer_norm_epsilon: float. The epsilon value used for every layer
+             norm in the transformer model.
+         dropout: float. Dropout probability for the transformer encoder.
+         sliding_window_size: int. Size of the sliding local window. Defaults
+             to 32768.
+         dtype: str or `keras.mixed_precision.DTypePolicy`. The dtype to use
+             for the model's computations and weights. Note that some
+             computations, such as softmax and layer normalization, will
+             always be done at float32 precision regardless of dtype.
+
+     Example:
+     ```python
+     input_data = {
+         "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+         "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+     }
+
+     # Pretrained Qwen3 MoE decoder.
+     model = keras_hub.models.Qwen3MoeBackbone.from_preset("qwen3_moe_a2_7b")
+     model(input_data)
+
+     # Randomly initialized Qwen3 MoE decoder with custom config.
+     model = keras_hub.models.Qwen3MoeBackbone(
+         vocabulary_size=151936,
+         num_layers=28,
+         num_query_heads=16,
+         num_key_value_heads=8,
+         hidden_dim=2048,
+         intermediate_dim=4096,
+         moe_intermediate_dim=128,
+         num_experts=60,
+         top_k=4,
+         head_dim=128,
+     )
+     model(input_data)
+     """
+
+     def __init__(
+         self,
+         vocabulary_size,
+         num_layers,
+         num_query_heads,
+         num_key_value_heads,
+         hidden_dim,
+         intermediate_dim,
+         moe_intermediate_dim,
+         num_experts,
+         head_dim=None,
+         top_k=4,
+         norm_top_k_prob=False,
+         decoder_sparse_step=1,
+         rope_max_wavelength=10000,
+         rope_scaling_factor=1.0,
+         layer_norm_epsilon=1e-6,
+         dropout=0,
+         dtype=None,
+         tie_word_embeddings=False,
+         sliding_window_size=32768,
+         router_aux_loss_coefficient=0.001,
+         mlp_only_layers=None,
+         training=None,
+         **kwargs,
+     ):
+         # === Layers ===
+         self.token_embedding = ReversibleEmbedding(
+             input_dim=vocabulary_size,
+             output_dim=hidden_dim,
+             tie_weights=tie_word_embeddings,
+             embeddings_initializer=_qwen3_moe_kernel_initializer(stddev=0.01),
+             dtype=dtype,
+             name="token_embedding",
+         )
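+         # `tie_weights` lets the same embedding matrix serve both the input
+         # lookup and, when reversed, the output (logits) projection; it is
+         # controlled here by `tie_word_embeddings`.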
+
+         if not mlp_only_layers:
+             mlp_only_layers = []
+
+         self.transformer_layers = []
+         for i in range(num_layers):
+             is_sparse_mlp = (
+                 (i not in mlp_only_layers)
+                 and num_experts > 0
+                 and (i + 1) % decoder_sparse_step == 0
+             )
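+             # With the default decoder_sparse_step=1 and empty
+             # mlp_only_layers, every layer is a sparse MoE layer. For
+             # example, decoder_sparse_step=2 would make only every second
+             # layer sparse, and any index listed in mlp_only_layers always
+             # falls back to a dense MLP.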
+             layer = Qwen3MoeTransformerDecoder(
+                 intermediate_dim=intermediate_dim,
+                 num_query_heads=num_query_heads,
+                 num_key_value_heads=num_key_value_heads,
+                 moe_intermediate_dim=moe_intermediate_dim,
+                 head_dim=head_dim,
+                 num_experts=num_experts,
+                 top_k=top_k,
+                 norm_top_k_prob=norm_top_k_prob,
+                 rope_max_wavelength=rope_max_wavelength,
+                 rope_scaling_factor=rope_scaling_factor,
+                 layer_norm_epsilon=layer_norm_epsilon,
+                 activation=ops.silu,
+                 kernel_initializer=_qwen3_moe_kernel_initializer(stddev=0.02),
+                 dropout=dropout,
+                 dtype=dtype,
+                 sliding_window_size=sliding_window_size,
+                 router_aux_loss_coefficient=router_aux_loss_coefficient,
+                 is_sparse_mlp=is_sparse_mlp,
+                 name=f"transformer_layer_{i}",
+             )
+             self.transformer_layers.append(layer)
+         self.layer_norm = Qwen3MoeLayerNorm(
+             epsilon=layer_norm_epsilon,
+             dtype=dtype,
+             name="sequence_output_layernorm",
+         )
+
+         # === Functional Model ===
+         token_id_input = keras.Input(
+             shape=(None,), dtype="int32", name="token_ids"
+         )
+         padding_mask_input = keras.Input(
+             shape=(None,), dtype="int32", name="padding_mask"
+         )
+         x = self.token_embedding(token_id_input)
+         for transformer_layer in self.transformer_layers:
+             x = transformer_layer(
+                 x, decoder_padding_mask=padding_mask_input, training=training
+             )
+         sequence_output = self.layer_norm(x)
+         super().__init__(
+             inputs={
+                 "token_ids": token_id_input,
+                 "padding_mask": padding_mask_input,
+             },
+             outputs=sequence_output,
+             dtype=dtype,
+             **kwargs,
+         )
+
+         # === Config ===
+         self.vocabulary_size = vocabulary_size
+         self.num_layers = num_layers
+         self.num_query_heads = num_query_heads
+         self.hidden_dim = hidden_dim
+         self.intermediate_dim = intermediate_dim
+         self.moe_intermediate_dim = moe_intermediate_dim
+         self.head_dim = head_dim
+         self.rope_max_wavelength = rope_max_wavelength
+         self.num_key_value_heads = num_key_value_heads
+         self.rope_scaling_factor = rope_scaling_factor
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.dropout = dropout
+         self.tie_word_embeddings = tie_word_embeddings
+         self.sliding_window_size = sliding_window_size
+         self.num_experts = num_experts
+         self.top_k = top_k
+         self.norm_top_k_prob = norm_top_k_prob
+         self.decoder_sparse_step = decoder_sparse_step
+         self.mlp_only_layers = mlp_only_layers
+         self.router_aux_loss_coefficient = router_aux_loss_coefficient
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "vocabulary_size": self.vocabulary_size,
+                 "num_layers": self.num_layers,
+                 "num_query_heads": self.num_query_heads,
+                 "head_dim": self.head_dim,
+                 "hidden_dim": self.hidden_dim,
+                 "intermediate_dim": self.intermediate_dim,
+                 "moe_intermediate_dim": self.moe_intermediate_dim,
+                 "rope_max_wavelength": self.rope_max_wavelength,
+                 "num_key_value_heads": self.num_key_value_heads,
+                 "rope_scaling_factor": self.rope_scaling_factor,
+                 "layer_norm_epsilon": self.layer_norm_epsilon,
+                 "dropout": self.dropout,
+                 "tie_word_embeddings": self.tie_word_embeddings,
+                 "sliding_window_size": self.sliding_window_size,
+                 "num_experts": self.num_experts,
+                 "top_k": self.top_k,
+                 "norm_top_k_prob": self.norm_top_k_prob,
+                 "decoder_sparse_step": self.decoder_sparse_step,
+                 "mlp_only_layers": self.mlp_only_layers,
+                 "router_aux_loss_coefficient": self.router_aux_loss_coefficient,
+             }
+         )
+         return config
+
+     @staticmethod
+     def get_layout_map(
+         device_mesh,
+         model_parallel_dim_name="model",
+         data_parallel_dim_name="batch",
+     ):
+         """Get a `keras.distribution.LayoutMap` for model parallel distribution.
+
+         The returned `LayoutMap` contains the sharding spec for the Qwen3Moe
+         backbone weights, so that you can use it to distribute weights across
+         the accelerators.
+
+         Example:
+         ```
+         # Feel free to change the mesh shape to balance data and model
+         # parallelism.
+         mesh = keras.distribution.DeviceMesh(
+             shape=(1, 8),
+             axis_names=('batch', 'model'),
+             devices=keras.distribution.list_devices(),
+         )
+         layout_map = Qwen3MoeBackbone.get_layout_map(
+             mesh,
+             model_parallel_dim_name="model",
+         )
+
+         distribution = keras.distribution.ModelParallel(
+             layout_map=layout_map,
+             batch_dim_name='batch',
+         )
+
+         with distribution.scope():
+             qwen3_moe_model = keras_hub.models.Qwen3MoeBackbone.from_preset(
+                 "qwen3_moe_a2_7b"
+             )
+         ```
+
+         To see how the layout map was applied, load the model, then run
+         (for one decoder block):
+         ```
+         embedding_layer = qwen3_moe_model.get_layer("token_embedding")
+         decoder_block_1 = qwen3_moe_model.get_layer('transformer_layer_0')
+         for variable in embedding_layer.weights + decoder_block_1.weights:
+             print(
+                 f'{variable.path:<58} {str(variable.shape):<16} '
+                 f'{str(variable.value.sharding.spec)}'
+             )
+         ```
+
+         Args:
+             device_mesh: The `keras.distribution.DeviceMesh` instance for
+                 distribution.
+             model_parallel_dim_name: The axis name of the device mesh on
+                 which the weights should be partitioned.
+             data_parallel_dim_name: The axis name of the device mesh on
+                 which the data should be partitioned.
+
+         Returns:
+             `keras.distribution.LayoutMap` that contains the sharding spec
+             for all the model weights.
+         """
+         # The weight paths and shapes of the Llama backbone are like below
+         # token_embedding/embeddings (128256, 2048)
+         # repeat block for decoder
+         # transformer_layer_0/self_attention/query/kernel (2048, 32, 64)
+         # transformer_layer_0/self_attention/key/kernel (2048, 8, 64)
+         # transformer_layer_0/self_attention/value/kernel (2048, 8, 64)
+         # transformer_layer_0/self_attention/attention_output/kernel
+         # (32, 64, 2048)
+         # transformer_layer_0/self_attention_layernorm/scale (2048,)
+         # transformer_layer_0/feedforward_intermediate_dense/kernel
+         # (2048, 8192)
+         # transformer_layer_0/feedforward_gate_dense/kernel (2048, 8192)
+         # transformer_layer_0/feedforward_output_dense/kernel (8192, 2048)
+         # transformer_layer_0/feedforward_layernorm/scale (2048,)
+
+         if not isinstance(device_mesh, keras.distribution.DeviceMesh):
+             raise ValueError(
+                 "Invalid device_mesh type. Expected "
+                 f"`keras.distribution.DeviceMesh`, got {type(device_mesh)}"
+             )
+         if model_parallel_dim_name not in device_mesh.axis_names:
+             raise ValueError(
+                 f"{model_parallel_dim_name} is not found in the "
+                 f"device_mesh.axis_names. {device_mesh.axis_names=}"
+             )
+         if data_parallel_dim_name not in device_mesh.axis_names:
+             raise ValueError(
+                 f"{data_parallel_dim_name} is not found in the "
+                 f"device_mesh.axis_names. {device_mesh.axis_names=}"
+             )
+         # Note that it is possible to further configure the mesh to be 3D,
+         # e.g. (data, seq, model). We leave it as 2D for now for simplicity.
+         data_dim = data_parallel_dim_name
+         model_dim = model_parallel_dim_name
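+         # Example: on a (1, 8) ('batch', 'model') mesh, the
+         # (model_dim, data_dim) layout below shards
+         # token_embedding/embeddings (vocab, hidden) eight ways along the
+         # vocabulary axis; with a batch axis of size 1, the hidden axis is
+         # effectively replicated.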
+         # The sharding config is based on the Gemma team training config.
+         # See https://arxiv.org/abs/2403.08295
+         layout_map = keras.distribution.LayoutMap(device_mesh)
+         layout_map["token_embedding/embeddings"] = (model_dim, data_dim)
+         layout_map[
+             "transformer_layer.*self_attention.*(query|key|value).kernel"
+         ] = (
+             model_dim,
+             data_dim,
+             None,
+         )
+         layout_map["transformer_layer.*attention_output.kernel"] = (
+             model_dim,
+             None,
+             data_dim,
+         )
+         layout_map[
+             "transformer_layer.*feedforward_intermediate_dense.kernel"
+         ] = (
+             data_dim,
+             model_dim,
+         )
+         layout_map["transformer_layer.*feedforward_gate_dense.kernel"] = (
+             data_dim,
+             model_dim,
+         )
+         layout_map["transformer_layer.*feedforward_output_dense.kernel"] = (
+             model_dim,
+             data_dim,
+         )
+
+         return layout_map
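
As a quick sanity check of the new backbone, here is a minimal sketch that builds a tiny, randomly initialized `Qwen3MoeBackbone` and runs one forward pass. The hyperparameter values below are arbitrary, chosen only to keep the model small, and assume the nightly wheel above is installed.

```python
import numpy as np

import keras_hub

# Tiny, randomly initialized config. With the default
# decoder_sparse_step=1, both layers are sparse MoE layers.
backbone = keras_hub.models.Qwen3MoeBackbone(
    vocabulary_size=256,
    num_layers=2,
    num_query_heads=4,
    num_key_value_heads=2,
    hidden_dim=64,
    intermediate_dim=128,
    moe_intermediate_dim=32,
    num_experts=4,
    top_k=2,
    head_dim=16,
)
input_data = {
    "token_ids": np.ones((1, 12), dtype="int32"),
    "padding_mask": np.ones((1, 12), dtype="int32"),
}
# The backbone returns the final per-token hidden states,
# shaped (batch, sequence, hidden_dim).
output = backbone(input_data)
print(output.shape)  # (1, 12, 64)
```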