keras-hub 0.20.0.dev1-py3-none-any.whl → 0.21.0.dev1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- keras_hub/__init__.py +15 -33
- keras_hub/layers/__init__.py +134 -0
- keras_hub/metrics/__init__.py +11 -0
- keras_hub/models/__init__.py +642 -0
- keras_hub/samplers/__init__.py +18 -0
- keras_hub/src/layers/modeling/reversible_embedding.py +25 -35
- keras_hub/src/layers/preprocessing/image_converter.py +1 -0
- keras_hub/src/layers/preprocessing/random_deletion.py +1 -1
- keras_hub/src/layers/preprocessing/random_swap.py +1 -1
- keras_hub/src/models/audio_to_text.py +66 -0
- keras_hub/src/models/audio_to_text_preprocessor.py +80 -0
- keras_hub/src/models/backbone.py +5 -2
- keras_hub/src/models/cspnet/cspnet_backbone.py +51 -26
- keras_hub/src/models/cspnet/cspnet_presets.py +38 -3
- keras_hub/src/models/falcon/falcon_backbone.py +1 -1
- keras_hub/src/models/gemma/gemma_presets.py +10 -10
- keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +3 -2
- keras_hub/src/models/gemma3/gemma3_presets.py +8 -8
- keras_hub/src/models/gemma3/gemma3_vision_encoder.py +1 -1
- keras_hub/src/models/llama/llama_attention.py +24 -6
- keras_hub/src/models/llama/llama_backbone.py +50 -16
- keras_hub/src/models/llama/llama_decoder.py +20 -3
- keras_hub/src/models/llama/llama_presets.py +3 -3
- keras_hub/src/models/llama/llama_rotary_embedding.py +180 -0
- keras_hub/src/models/llama3/llama3_backbone.py +10 -2
- keras_hub/src/models/llama3/llama3_presets.py +84 -2
- keras_hub/src/models/mistral/mistral_presets.py +3 -3
- keras_hub/src/models/mixtral/__init__.py +5 -0
- keras_hub/src/models/mixtral/mixtral_attention.py +252 -0
- keras_hub/src/models/mixtral/mixtral_backbone.py +207 -0
- keras_hub/src/models/mixtral/mixtral_causal_lm.py +281 -0
- keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py +76 -0
- keras_hub/src/models/mixtral/mixtral_decoder.py +494 -0
- keras_hub/src/models/mixtral/mixtral_layer_norm.py +34 -0
- keras_hub/src/models/mixtral/mixtral_presets.py +26 -0
- keras_hub/src/models/mixtral/mixtral_tokenizer.py +21 -0
- keras_hub/src/models/moonshine/__init__.py +5 -0
- keras_hub/src/models/moonshine/moonshine_audio_converter.py +301 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text.py +383 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py +272 -0
- keras_hub/src/models/moonshine/moonshine_backbone.py +478 -0
- keras_hub/src/models/moonshine/moonshine_decoder.py +313 -0
- keras_hub/src/models/moonshine/moonshine_encoder.py +212 -0
- keras_hub/src/models/moonshine/moonshine_layers.py +239 -0
- keras_hub/src/models/moonshine/moonshine_multi_head_attention.py +355 -0
- keras_hub/src/models/moonshine/moonshine_presets.py +25 -0
- keras_hub/src/models/moonshine/moonshine_tokenizer.py +62 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +11 -11
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +1 -1
- keras_hub/src/models/qwen/__init__.py +4 -0
- keras_hub/src/models/qwen/qwen_attention.py +3 -1
- keras_hub/src/models/qwen/qwen_backbone.py +8 -1
- keras_hub/src/models/qwen/qwen_causal_lm.py +7 -0
- keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py +7 -0
- keras_hub/src/models/qwen/qwen_presets.py +61 -0
- keras_hub/src/models/qwen/qwen_tokenizer.py +9 -0
- keras_hub/src/models/qwen_moe/__init__.py +5 -0
- keras_hub/src/models/qwen_moe/qwen_moe_attention.py +375 -0
- keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +373 -0
- keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py +350 -0
- keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py +17 -0
- keras_hub/src/models/qwen_moe/qwen_moe_decoder.py +625 -0
- keras_hub/src/models/qwen_moe/qwen_moe_layernorm.py +32 -0
- keras_hub/src/models/qwen_moe/qwen_moe_presets.py +15 -0
- keras_hub/src/models/qwen_moe/qwen_moe_tokenizer.py +46 -0
- keras_hub/src/models/retinanet/retinanet_image_converter.py +0 -13
- keras_hub/src/models/retinanet/retinanet_presets.py +2 -2
- keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +0 -18
- keras_hub/src/models/segformer/segformer_presets.py +12 -12
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +6 -0
- keras_hub/src/models/task.py +5 -2
- keras_hub/src/models/xception/__init__.py +5 -0
- keras_hub/src/models/xception/xception_backbone.py +188 -0
- keras_hub/src/models/xception/xception_image_classifier.py +12 -0
- keras_hub/src/models/xception/xception_image_classifier_preprocessor.py +14 -0
- keras_hub/src/models/xception/xception_image_converter.py +8 -0
- keras_hub/src/models/xception/xception_presets.py +14 -0
- keras_hub/src/tests/mocks/mock_gemma3_tokenizer.py +155 -0
- keras_hub/src/utils/coco/__init__.py +0 -0
- keras_hub/src/utils/coco/coco_utils.py +133 -0
- keras_hub/src/utils/imagenet/imagenet_utils.py +36 -0
- keras_hub/src/utils/keras_utils.py +11 -0
- keras_hub/src/utils/preset_utils.py +70 -10
- keras_hub/src/utils/tensor_utils.py +27 -1
- keras_hub/src/utils/timm/convert_cspnet.py +94 -23
- keras_hub/src/utils/timm/preset_loader.py +6 -6
- keras_hub/src/utils/transformers/convert_llama3.py +21 -1
- keras_hub/src/utils/transformers/convert_mixtral.py +139 -0
- keras_hub/src/utils/transformers/convert_qwen.py +1 -0
- keras_hub/src/utils/transformers/convert_qwen_moe.py +253 -0
- keras_hub/src/utils/transformers/preset_loader.py +6 -0
- keras_hub/src/{version_utils.py → version.py} +1 -1
- keras_hub/tokenizers/__init__.py +117 -0
- keras_hub/utils/__init__.py +21 -0
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/METADATA +6 -20
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/RECORD +98 -55
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/WHEEL +1 -1
- keras_hub/api/__init__.py +0 -15
- keras_hub/api/layers/__init__.py +0 -86
- keras_hub/api/metrics/__init__.py +0 -11
- keras_hub/api/models/__init__.py +0 -416
- keras_hub/api/samplers/__init__.py +0 -16
- keras_hub/api/tokenizers/__init__.py +0 -58
- keras_hub/api/utils/__init__.py +0 -9
- {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/top_level.txt +0 -0
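The single hunk shown below is the new `keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py` (the only file above that gains exactly 272 lines):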
@@ -0,0 +1,272 @@
+import keras
+
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.audio_to_text_preprocessor import (
+    AudioToTextPreprocessor,
+)
+from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
+from keras_hub.src.models.moonshine.moonshine_tokenizer import (
+    MoonshineTokenizer,
+)
+from keras_hub.src.utils.tensor_utils import preprocessing_function
+
+
+@keras_hub_export("keras_hub.models.MoonshineAudioToTextPreprocessor")
+class MoonshineAudioToTextPreprocessor(AudioToTextPreprocessor):
+    """Moonshine Seq2Seq LM preprocessor for audio-to-text tasks.
+
+    This preprocessor converts raw audio and text inputs into a format suitable
+    for the `MoonshineAudioToText` model. It processes audio waveforms using
+    `MoonshineAudioConverter` for basic preprocessing (padding, normalization)
+    and tokenizes text using `MoonshineTokenizer` for the decoder. It supports
+    training and generation.
+
+    Args:
+        audio_converter: A `MoonshineAudioConverter` instance to process audio.
+        tokenizer: A `MoonshineTokenizer` instance to tokenize text.
+        decoder_sequence_length: int, optional. Maximum length for decoder token
+            sequences. Defaults to 1024.
+        **kwargs: Additional keyword arguments for the parent class.
+
+    Examples:
+    ```python
+    import keras
+    from keras_hub.layers import MoonshineAudioConverter
+    from keras_hub.models import MoonshineTokenizer
+
+    # Create audio converter and tokenizer instances.
+    audio_converter = MoonshineAudioConverter()
+    tokenizer = MoonshineTokenizer.from_preset("moonshine_base")
+
+    # Initialize the preprocessor.
+    preprocessor = keras_hub.models.MoonshineAudioToTextPreprocessor(
+        audio_converter=audio_converter,
+        tokenizer=tokenizer,
+        decoder_sequence_length=8
+    )
+
+    # Prepare input data (audio tensor and text).
+    inputs = {
+        "audio": keras.random.normal((1, 16000)),
+        "text": ["the quick brown fox"]
+    }
+
+    # Process the inputs for training.
+    x, y, sample_weight = preprocessor(inputs)
+
+    # Check output keys and shapes (shapes depend on padding/truncation).
+    print(x.keys())
+    # dict_keys(['encoder_input_values', 'encoder_padding_mask',
+    # 'decoder_token_ids', 'decoder_padding_mask']).
+    print(x["encoder_input_values"].shape) # e.g., (1, 16000, 1) / padded length
+    print(x["encoder_padding_mask"].shape) # e.g., (1, 16000) or padded length
+    print(x["decoder_token_ids"].shape) # (1, 8)
+    print(x["decoder_padding_mask"].shape) # (1, 8)
+    print(y.shape) # (1, 8) - Labels
+    print(sample_weight.shape) # (1, 8) - Sample weights
+
+    # Process inputs for generation.
+    gen_inputs = preprocessor.generate_preprocess(inputs)
+    print(gen_inputs.keys())
+    # dict_keys(['encoder_input_values', 'encoder_padding_mask',
+    # 'decoder_token_ids', 'decoder_padding_mask']).
+    ```
+    """
+
+    backbone_cls = MoonshineBackbone
+    tokenizer_cls = MoonshineTokenizer
+
+    def __init__(
+        self,
+        audio_converter,
+        tokenizer,
+        decoder_sequence_length=1024,
+        **kwargs,
+    ):
+        super().__init__(tokenizer=tokenizer, **kwargs)
+        self.audio_converter = audio_converter
+        self.decoder_sequence_length = decoder_sequence_length
+        self.decoder_packer = None
+        self._special_token_ids_set = None
+
+    def build(self, input_shape):
+        self.decoder_packer = StartEndPacker(
+            start_value=self.tokenizer.start_token_id,
+            end_value=self.tokenizer.end_token_id,
+            pad_value=self.tokenizer.pad_token_id,
+            sequence_length=self.decoder_sequence_length,
+            return_padding_mask=True,
+        )
+        self._special_token_ids_set = set(self.tokenizer.special_token_ids)
+        if self.tokenizer.pad_token_id is not None:
+            self._special_token_ids_set.add(self.tokenizer.pad_token_id)
+        self.built = True
+
+    @preprocessing_function
+    def call(
+        self,
+        x,
+        y=None,
+        sample_weight=None,
+        decoder_sequence_length=None,
+        sequence_length=None,
+    ):
+        if not self.built:
+            self.build(None)
+        if isinstance(x, tuple) and len(x) == 1:
+            x = x[0]
+        decoder_sequence_length = (
+            decoder_sequence_length
+            or sequence_length
+            or self.decoder_sequence_length
+        )
+        text = x["text"]
+        encoder_inputs = self.audio_converter(
+            x["audio"],
+            padding="longest",
+        )
+        encoder_inputs_shape = keras.ops.shape(encoder_inputs)
+        if len(encoder_inputs_shape) == 2:
+            encoder_inputs = keras.ops.expand_dims(encoder_inputs, axis=-1)
+        squeezed_inputs = encoder_inputs[:, :, 0]
+        is_tf_symbolic = (
+            tf is not None
+            and hasattr(squeezed_inputs, "graph")
+            and hasattr(squeezed_inputs.graph, "as_graph_def")
+        )
+        if is_tf_symbolic and keras.config.backend() != "tensorflow":
+            encoder_padding_mask = tf.logical_not(
+                tf.math.equal(
+                    squeezed_inputs, float(self.audio_converter.padding_value)
+                )
+            )
+        else:
+            encoder_padding_mask = keras.ops.logical_not(
+                keras.ops.equal(
+                    squeezed_inputs, self.audio_converter.padding_value
+                )
+            )
+        decoder_inputs = self.tokenizer(text)
+        decoder_token_ids, decoder_padding_mask = self.decoder_packer(
+            decoder_inputs,
+            sequence_length=decoder_sequence_length + 1,
+            add_end_value=True,
+        )
+        x_out = {
+            "encoder_input_values": encoder_inputs,
+            "encoder_padding_mask": encoder_padding_mask,
+            "decoder_token_ids": decoder_token_ids[..., :-1],
+            "decoder_padding_mask": decoder_padding_mask[..., :-1],
+        }
+        y_out = decoder_token_ids[..., 1:]
+        sample_weight_out = decoder_padding_mask[..., 1:]
+
+        return keras.utils.pack_x_y_sample_weight(
+            x_out, y_out, sample_weight_out
+        )
+
+    @preprocessing_function
+    def generate_preprocess(
+        self,
+        x,
+        decoder_sequence_length=None,
+        sequence_length=None,
+    ):
+        if not self.built:
+            self.build(None)
+        if isinstance(x, tuple) and len(x) == 1:
+            x = x[0]
+        decoder_sequence_length = (
+            decoder_sequence_length
+            or sequence_length
+            or self.decoder_sequence_length
+        )
+        encoder_inputs = self.audio_converter(
+            x["audio"],
+            padding="longest",
+        )
+        encoder_inputs_shape = keras.ops.shape(encoder_inputs)
+        if len(encoder_inputs_shape) == 2:
+            encoder_inputs = keras.ops.expand_dims(encoder_inputs, axis=-1)
+        squeezed_inputs = encoder_inputs[:, :, 0]
+        is_tf_symbolic = (
+            tf is not None
+            and hasattr(squeezed_inputs, "graph")
+            and hasattr(squeezed_inputs.graph, "as_graph_def")
+        )
+        if is_tf_symbolic and keras.config.backend() != "tensorflow":
+            encoder_padding_mask = tf.logical_not(
+                tf.math.equal(
+                    squeezed_inputs, float(self.audio_converter.padding_value)
+                )
+            )
+        else:
+            encoder_padding_mask = keras.ops.logical_not(
+                keras.ops.equal(
+                    squeezed_inputs, self.audio_converter.padding_value
+                )
+            )
+        audio_batch_size = keras.ops.shape(x["audio"])[0]
+        decoder_text = x.get("text", None)
+        if decoder_text is None:
+            decoder_token_ids = [
+                [self.tokenizer.start_token_id]
+            ] * audio_batch_size
+        else:
+            if isinstance(decoder_text, str):
+                decoder_text = [decoder_text] * audio_batch_size
+            elif len(decoder_text) != audio_batch_size:
+                if len(decoder_text) == 1:
+                    decoder_text = decoder_text * audio_batch_size
+                else:
+                    raise ValueError(
+                        f"Batch size mismatch between audio "
+                        f"({audio_batch_size}) and text prompts "
+                        f"({len(decoder_text)})"
+                    )
+            decoder_token_ids = self.tokenizer(decoder_text)
+        decoder_token_ids, decoder_padding_mask = self.decoder_packer(
+            decoder_token_ids,
+            sequence_length=decoder_sequence_length,
+            add_end_value=False,
+        )
+
+        return {
+            "encoder_input_values": encoder_inputs,
+            "encoder_padding_mask": encoder_padding_mask,
+            "decoder_token_ids": decoder_token_ids,
+            "decoder_padding_mask": decoder_padding_mask,
+        }
+
+    @preprocessing_function
+    def generate_postprocess(self, x):
+        if not self.built:
+            self.build(None)
+        token_ids, padding_mask = (
+            x["decoder_token_ids"],
+            x["decoder_padding_mask"],
+        )
+        token_ids_np = keras.ops.convert_to_numpy(token_ids)
+        padding_mask_np = keras.ops.convert_to_numpy(padding_mask)
+        vocab_size = self.tokenizer.vocabulary_size()
+        processed_sequences = []
+        for i in range(token_ids_np.shape[0]):
+            sequence = token_ids_np[i]
+            mask = padding_mask_np[i].astype(bool)
+            valid_tokens = sequence[mask]
+            filtered_tokens = [
+                int(token)
+                for token in valid_tokens
+                if token not in self._special_token_ids_set
+                and 0 <= token < vocab_size
+            ]
+            processed_sequences.append(filtered_tokens)
+        processed_sequences = tf.ragged.constant(
+            processed_sequences, dtype=tf.int32
+        )
+        return self.tokenizer.detokenize(processed_sequences)
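The `call` path above packs the decoder sequence to `decoder_sequence_length + 1` tokens and then takes two overlapping slices, so each decoder input token is paired with the next token as its label (teacher forcing), and padded label positions get zero sample weight. A minimal sketch of that shift with plain Python lists, using made-up token ids and hypothetical start/end/pad ids of 1/2/0 in place of `MoonshineTokenizer` and `StartEndPacker`:

```python
# Sketch of the teacher-forcing shift performed in `call`. The ids below are
# made up; real ids come from MoonshineTokenizer, and the packing is done by
# StartEndPacker. Assumed special ids: start=1, end=2, pad=0.
START_ID, END_ID, PAD_ID = 1, 2, 0
decoder_sequence_length = 6
token_ids = [5, 8, 9]  # hypothetical tokenized transcript

# Pack to decoder_sequence_length + 1: prepend start, append end, then pad.
packed = [START_ID] + token_ids + [END_ID]
packed += [PAD_ID] * (decoder_sequence_length + 1 - len(packed))
padding_mask = [t != PAD_ID for t in packed]  # True on real tokens

# Two overlapping windows: each input position predicts the next token.
decoder_token_ids = packed[:-1]   # [1, 5, 8, 9, 2, 0] -> fed to the decoder
labels = packed[1:]               # [5, 8, 9, 2, 0, 0] -> next-token targets
sample_weight = padding_mask[1:]  # padded targets contribute no loss

print(decoder_token_ids, labels, sample_weight)
```

The `[..., :-1]` and `[..., 1:]` slices in `call` apply this same shift to the whole batched tensor at once.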