keras-hub-nightly 0.21.0.dev202505130407__py3-none-any.whl → 0.21.0.dev202505150407__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +3 -0
- keras_hub/models/__init__.py +12 -0
- keras_hub/src/models/moonshine/__init__.py +0 -0
- keras_hub/src/models/moonshine/moonshine_audio_converter.py +301 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text.py +383 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py +267 -0
- keras_hub/src/models/moonshine/moonshine_backbone.py +478 -0
- keras_hub/src/models/moonshine/moonshine_decoder.py +313 -0
- keras_hub/src/models/moonshine/moonshine_encoder.py +212 -0
- keras_hub/src/models/moonshine/moonshine_layers.py +239 -0
- keras_hub/src/models/moonshine/moonshine_multi_head_attention.py +355 -0
- keras_hub/src/models/moonshine/moonshine_presets.py +25 -0
- keras_hub/src/models/moonshine/moonshine_tokenizer.py +62 -0
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.21.0.dev202505130407.dist-info → keras_hub_nightly-0.21.0.dev202505150407.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.21.0.dev202505130407.dist-info → keras_hub_nightly-0.21.0.dev202505150407.dist-info}/RECORD +19 -8
- {keras_hub_nightly-0.21.0.dev202505130407.dist-info → keras_hub_nightly-0.21.0.dev202505150407.dist-info}/WHEEL +1 -1
- {keras_hub_nightly-0.21.0.dev202505130407.dist-info → keras_hub_nightly-0.21.0.dev202505150407.dist-info}/top_level.txt +0 -0
keras_hub/layers/__init__.py
CHANGED
@@ -93,6 +93,9 @@ from keras_hub.src.models.mit.mit_image_converter import (
 from keras_hub.src.models.mobilenet.mobilenet_image_converter import (
     MobileNetImageConverter as MobileNetImageConverter,
 )
+from keras_hub.src.models.moonshine.moonshine_audio_converter import (
+    MoonshineAudioConverter as MoonshineAudioConverter,
+)
 from keras_hub.src.models.pali_gemma.pali_gemma_image_converter import (
     PaliGemmaImageConverter as PaliGemmaImageConverter,
 )
keras_hub/models/__init__.py
CHANGED
@@ -369,6 +369,18 @@ from keras_hub.src.models.mobilenet.mobilenet_image_classifier import (
 from keras_hub.src.models.mobilenet.mobilenet_image_classifier_preprocessor import (
     MobileNetImageClassifierPreprocessor as MobileNetImageClassifierPreprocessor,
 )
+from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
+    MoonshineAudioToText as MoonshineAudioToText,
+)
+from keras_hub.src.models.moonshine.moonshine_audio_to_text_preprocessor import (
+    MoonshineAudioToTextPreprocessor as MoonshineAudioToTextPreprocessor,
+)
+from keras_hub.src.models.moonshine.moonshine_backbone import (
+    MoonshineBackbone as MoonshineBackbone,
+)
+from keras_hub.src.models.moonshine.moonshine_tokenizer import (
+    MoonshineTokenizer as MoonshineTokenizer,
+)
 from keras_hub.src.models.object_detector import (
     ObjectDetector as ImageObjectDetector,
 )
keras_hub/src/models/moonshine/__init__.py
File without changes
keras_hub/src/models/moonshine/moonshine_audio_converter.py
ADDED
@@ -0,0 +1,301 @@
import keras

try:
    import tensorflow as tf
except ImportError:
    tf = None

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter
from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone


@keras_hub_export("keras_hub.layers.MoonshineAudioConverter")
class MoonshineAudioConverter(AudioConverter):
    """Moonshine audio preprocessing layer.

    This layer processes raw audio waveforms for the Moonshine ASR model. Audio
    is formatted as a batched tensor at a 16kHz sample rate and validated for
    length (0.1 to 64 seconds). The layer handles padding and optional
    normalization. It does not contain trainable weights.

    Args:
        sampling_rate: int, optional. The audio sampling rate in Hz. Defaults
            to 16,000.
        padding_value: float, optional. The value for padding. Defaults to 0.0.
        do_normalize: bool, optional. Whether to normalize inputs. Defaults to
            False.
        **kwargs: Additional keyword arguments passed to the base
            AudioConverter class for customizing the underlying preprocessing
            behavior.

    Call arguments:
    - `inputs`: The raw audio data to be processed. It should be a tensor of
        shape `(batch_size, time_steps, 1)` for mono audio. If the input has
        shape `(batch_size, time_steps)`, the layer will add the channel
        dimension.
    - `sampling_rate`: The sampling rate of the audio in Hz. If provided, it
        must match the expected sampling rate set during initialization
        (default is 16,000 Hz). If not provided, the expected sampling rate is
        taken from the initialization arguments.
    - `padding`: The padding strategy to apply. If provided, can be one of:
        - `"longest"`: If `pad_to_multiple_of` is set, pads the audio to
            make the time_steps dimension a multiple of `pad_to_multiple_of`.
        - `"max_length"`: Pads or truncates the audio to `max_length` time
            steps. If `pad_to_multiple_of` is set, the target length will be
            the smallest multiple of `pad_to_multiple_of` that is greater
            than or equal to `max_length`.
        - If not specified or `None`, no padding is applied.
    - `max_length`: The target number of time steps when `padding` is
        `"max_length"`. If not provided and `padding` is `"max_length"`, no
        padding or truncation is applied.
    - `pad_to_multiple_of`: If set, the padded time_steps will be a
        multiple of this value for the chosen padding strategy.

    Examples:
    ```python
    import keras
    from keras_hub.layers import MoonshineAudioConverter

    # Create a dummy audio input (1 second at 16kHz).
    dummy_audio = keras.ops.convert_to_tensor(
        [[0.1] * 16000],
        dtype="float32"
    )
    dummy_audio = keras.ops.expand_dims(dummy_audio, axis=-1)

    # Initialize the preprocessor.
    preprocessor = MoonshineAudioConverter(do_normalize=True)

    # Process the audio.
    processed_audio = preprocessor(dummy_audio)

    # Output shape.
    print(processed_audio.shape)  # Expected: (1, 16000, 1) or padded length
    ```
    """

    # References:
    # Defined and formulated based on the UsefulSensors implementation of audio
    # preprocessing logic (https://github.com/usefulsensors/moonshine/blob/main/moonshine/transcribe.py).

    backbone_cls = MoonshineBackbone

    def __init__(
        self,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self._convert_input_args = False
        self._allow_non_tensor_positional_args = True
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value
        self.do_normalize = do_normalize

    def call(
        self,
        inputs,
        sampling_rate=None,
        padding=None,
        max_length=None,
        pad_to_multiple_of=None,
    ):
        # Validate sampling rate.
        if sampling_rate is not None and sampling_rate != self.sampling_rate:
            raise ValueError(
                f"Expected sampling_rate {self.sampling_rate}, got "
                f"{sampling_rate}"
            )

        # Ensure inputs are (batch_size, time_steps, 1).
        input_shape = keras.ops.shape(inputs)
        input_rank = len(input_shape)
        if input_rank == 2:
            processed_inputs = keras.ops.expand_dims(inputs, axis=-1)
        elif input_rank == 3:
            processed_inputs = inputs
        else:
            raise ValueError(
                "Inputs must be mono audio: (batch_size, time_steps, 1)"
            )

        # Get original length and validate duration.
        current_shape = keras.ops.shape(processed_inputs)
        original_length = current_shape[1]
        duration = (
            keras.ops.cast(original_length, keras.backend.floatx())
            / self.sampling_rate
        )
        # Source: https://github.com/usefulsensors/moonshine/blob/4a000427bd36a1c2c6d20a86c672dbd850b44c88/moonshine/transcribe.py#L20
        is_invalid_duration = keras.ops.logical_or(
            keras.ops.less(duration, 0.1), keras.ops.greater(duration, 64.0)
        )

        def print_warning_fn():
            import warnings

            warnings.warn(
                "Audio duration must be between 0.1 and 64 seconds. For "
                "transcribing longer segments, pre-segment your audio and "
                "provide shorter segments."
            )
            return keras.ops.convert_to_tensor(True, dtype="bool")

        is_tf_symbolic = (
            tf is not None
            and hasattr(processed_inputs, "graph")
            and hasattr(processed_inputs.graph, "as_graph_def")
        )
        use_tf_graph_ops = tf is not None and is_tf_symbolic
        if use_tf_graph_ops and keras.config.backend() != "torch":
            _ = tf.cond(
                is_invalid_duration,
                print_warning_fn,
                lambda: keras.ops.convert_to_tensor(False, dtype="bool"),
            )
        else:
            if keras.ops.convert_to_numpy(is_invalid_duration):
                print_warning_fn()

        # Handle padding.
        if padding == "longest":
            target_length = original_length
            if pad_to_multiple_of:
                target_length = (
                    (target_length + pad_to_multiple_of - 1)
                    // pad_to_multiple_of
                ) * pad_to_multiple_of

            needs_padding = keras.ops.greater(target_length, original_length)

            def pad_fn():
                padding_amount = target_length - original_length
                paddings = [[0, 0], [0, padding_amount], [0, 0]]
                if use_tf_graph_ops and keras.config.backend() != "tensorflow":
                    return tf.pad(
                        processed_inputs,
                        paddings,
                        mode="CONSTANT",
                        constant_values=float(self.padding_value),
                    )
                else:
                    return keras.ops.pad(
                        processed_inputs,
                        paddings,
                        mode="constant",
                        constant_values=self.padding_value,
                    )

            if use_tf_graph_ops and keras.config.backend() != "torch":
                processed_inputs = tf.cond(
                    needs_padding, pad_fn, lambda: processed_inputs
                )
            else:
                processed_inputs = keras.ops.cond(
                    needs_padding, pad_fn, lambda: processed_inputs
                )

        elif padding == "max_length" and max_length is not None:
            target_length_const = max_length
            if pad_to_multiple_of:
                target_length_const = (
                    (target_length_const + pad_to_multiple_of - 1)
                    // pad_to_multiple_of
                ) * pad_to_multiple_of

            needs_padding = keras.ops.less(original_length, target_length_const)
            needs_truncating = keras.ops.greater(
                original_length, target_length_const
            )

            def pad_fn():
                padding_amount = target_length_const - original_length
                paddings = [[0, 0], [0, padding_amount], [0, 0]]
                if use_tf_graph_ops and keras.config.backend() != "tensorflow":
                    return tf.pad(
                        processed_inputs,
                        paddings,
                        mode="CONSTANT",
                        constant_values=float(self.padding_value),
                    )
                else:
                    return keras.ops.pad(
                        processed_inputs,
                        paddings,
                        mode="constant",
                        constant_values=self.padding_value,
                    )

            def trunc_fn():
                if use_tf_graph_ops and keras.config.backend() != "tensorflow":
                    return processed_inputs[:, :target_length_const, :]
                else:
                    return keras.ops.slice(
                        processed_inputs,
                        [0, 0, 0],
                        [-1, target_length_const, -1],
                    )

            if use_tf_graph_ops and keras.config.backend() != "torch":
                processed_inputs = tf.cond(
                    needs_padding,
                    pad_fn,
                    lambda: tf.cond(
                        needs_truncating, trunc_fn, lambda: processed_inputs
                    ),
                )
            else:
                needs_padding = keras.ops.less(
                    original_length, target_length_const
                )
                needs_truncating = keras.ops.greater(
                    original_length, target_length_const
                )
                needs_padding_bool = keras.ops.convert_to_numpy(needs_padding)
                needs_truncating_bool = keras.ops.convert_to_numpy(
                    needs_truncating
                )

                if needs_padding_bool:
                    padding_amount = target_length_const - original_length
                    paddings = [[0, 0], [0, padding_amount], [0, 0]]
                    processed_inputs = keras.ops.pad(
                        processed_inputs,
                        paddings,
                        mode="constant",
                        constant_values=self.padding_value,
                    )
                elif needs_truncating_bool:
                    processed_inputs = processed_inputs[
                        :, :target_length_const, :
                    ]

        # Normalize if enabled.
        if self.do_normalize:
            mean = keras.ops.mean(processed_inputs, axis=1, keepdims=True)
            var = keras.ops.var(processed_inputs, axis=1, keepdims=True)
            processed_inputs = (processed_inputs - mean) / keras.ops.sqrt(
                var + 1e-7
            )

        return processed_inputs

    def compute_output_shape(self, input_shape):
        # [batch_size, time_steps] → [batch_size, time_steps, 1].
        if len(input_shape) == 2 or len(input_shape) == 3:
            return (input_shape[0], None, 1)
        else:
            raise ValueError("Input shape must be rank 2 or 3.")

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "sampling_rate": self.sampling_rate,
                "padding_value": self.padding_value,
                "do_normalize": self.do_normalize,
            }
        )
        return config
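A short usage sketch of the padding arguments documented in the docstring above, with illustrative values; the shape shown follows from the rounding rule ((max_length + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of on the eager execution path:

import keras
from keras_hub.layers import MoonshineAudioConverter

converter = MoonshineAudioConverter()

# 1.5 seconds of 16 kHz mono audio: shape (1, 24000, 1).
audio = keras.ops.ones((1, 24000, 1)) * 0.1

# "max_length" pads (or truncates) to max_length, rounded up to a multiple
# of pad_to_multiple_of: ((32000 + 1024 - 1) // 1024) * 1024 = 32768 steps.
padded = converter(
    audio, padding="max_length", max_length=32000, pad_to_multiple_of=1024
)
print(keras.ops.shape(padded))  # (1, 32768, 1)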