keras-hub-nightly 0.21.0.dev202505140407__py3-none-any.whl → 0.21.0.dev202505150407__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -93,6 +93,9 @@ from keras_hub.src.models.mit.mit_image_converter import (
  from keras_hub.src.models.mobilenet.mobilenet_image_converter import (
      MobileNetImageConverter as MobileNetImageConverter,
  )
+ from keras_hub.src.models.moonshine.moonshine_audio_converter import (
+     MoonshineAudioConverter as MoonshineAudioConverter,
+ )
  from keras_hub.src.models.pali_gemma.pali_gemma_image_converter import (
      PaliGemmaImageConverter as PaliGemmaImageConverter,
  )
@@ -369,6 +369,18 @@ from keras_hub.src.models.mobilenet.mobilenet_image_classifier import (
  from keras_hub.src.models.mobilenet.mobilenet_image_classifier_preprocessor import (
      MobileNetImageClassifierPreprocessor as MobileNetImageClassifierPreprocessor,
  )
+ from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
+     MoonshineAudioToText as MoonshineAudioToText,
+ )
+ from keras_hub.src.models.moonshine.moonshine_audio_to_text_preprocessor import (
+     MoonshineAudioToTextPreprocessor as MoonshineAudioToTextPreprocessor,
+ )
+ from keras_hub.src.models.moonshine.moonshine_backbone import (
+     MoonshineBackbone as MoonshineBackbone,
+ )
+ from keras_hub.src.models.moonshine.moonshine_tokenizer import (
+     MoonshineTokenizer as MoonshineTokenizer,
+ )
  from keras_hub.src.models.object_detector import (
      ObjectDetector as ImageObjectDetector,
  )
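Taken together with the converter export in the first hunk, these additions expose the full Moonshine speech-to-text stack (backbone, tokenizer, preprocessor, and task) under the public `keras_hub.models` namespace. The sketch below shows how the pieces are meant to compose; the preset name and the exact `generate()` input format are illustrative assumptions, not something this diff specifies.

```python
import numpy as np
import keras_hub

# Hypothetical preset name; check the keras-hub documentation for the
# Moonshine presets that actually ship with this release.
asr = keras_hub.models.MoonshineAudioToText.from_preset("moonshine_base_en")

# One second of silence at 16 kHz, shaped (batch, time_steps, 1) as expected
# by the MoonshineAudioConverter added below.
audio = np.zeros((1, 16000, 1), dtype="float32")

# Assumed input format: generate() runs audio conversion, tokenization, and
# autoregressive decoding, returning transcribed strings.
print(asr.generate({"audio": audio}))
```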
keras_hub/src/models/moonshine/moonshine_audio_converter.py (new file)
@@ -0,0 +1,301 @@
+ import keras
+
+ try:
+     import tensorflow as tf
+ except ImportError:
+     tf = None
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter
+ from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
+
+
+ @keras_hub_export("keras_hub.layers.MoonshineAudioConverter")
+ class MoonshineAudioConverter(AudioConverter):
+     """Moonshine audio preprocessing layer.
+
+     This layer processes raw audio waveforms for the Moonshine ASR model. Audio
+     is formatted as a batched tensor at a 16kHz sample rate and validated for
+     length (0.1 to 64 seconds). The layer handles padding and optional
+     normalization. It does not contain trainable weights.
+
+     Args:
+         sampling_rate: int, optional. The audio sampling rate in Hz. Defaults to
+             16,000.
+         padding_value: float, optional. The value for padding. Defaults to 0.0.
+         do_normalize: bool, optional. Whether to normalize inputs. Defaults to
+             False.
+         **kwargs: Additional keyword arguments passed to the base AudioConverter
+             class for customizing the underlying preprocessing behavior.
+
+     Call arguments:
+         - `inputs`: The raw audio data to be processed. It should be a tensor of
+           shape `(batch_size, time_steps, 1)` for mono audio. If the input has
+           shape `(batch_size, time_steps)`, the layer will add the channel
+           dimension.
+         - `sampling_rate`: The sampling rate of the audio in Hz. If
+           provided, it must match the expected sampling rate set during
+           initialization (default is 16,000 Hz). If not provided, the expected
+           sampling rate is taken from the initialization arguments.
+         - `padding`: The padding strategy to apply. If provided, can be one of:
+             - `"longest"`: If `pad_to_multiple_of` is set, pads the audio to
+               make the time_steps dimension a multiple of `pad_to_multiple_of`.
+             - `"max_length"`: Pads or truncates the audio to `max_length` time
+               steps. If `pad_to_multiple_of` is set, the target length will be
+               the smallest multiple of `pad_to_multiple_of` that is greater than
+               or equal to `max_length`.
+             - If not specified or `None`, no padding is applied.
+         - `max_length`: The target number of time steps when `padding` is
+           `"max_length"`. If not provided and `padding` is `"max_length"`, no
+           padding or truncation is applied.
+         - `pad_to_multiple_of`: If set, the padded time_steps will be a
+           multiple of this value for the chosen padding strategy.
+
+     Examples:
+     ```python
+     import keras
+     from keras_hub.layers import MoonshineAudioConverter
+
+     # Create a dummy audio input (1 second at 16kHz).
+     dummy_audio = keras.ops.convert_to_tensor(
+         [[0.1] * 16000],
+         dtype="float32"
+     )
+     dummy_audio = keras.ops.expand_dims(dummy_audio, axis=-1)
+
+     # Initialize the preprocessor.
+     preprocessor = MoonshineAudioConverter(do_normalize=True)
+
+     # Process the audio.
+     processed_audio = preprocessor(dummy_audio)
+
+     # Output shape.
+     print(processed_audio.shape)  # Expected: (1, 16000, 1) or padded length
+     ```
+     """
+
+     # References:
+     # Defined and formulated based on the UsefulSensors implementation of audio
+     # preprocessing logic (https://github.com/usefulsensors/moonshine/blob/main/moonshine/transcribe.py).
+
+     backbone_cls = MoonshineBackbone
+
+     def __init__(
+         self,
+         sampling_rate=16000,
+         padding_value=0.0,
+         do_normalize=False,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self._convert_input_args = False
+         self._allow_non_tensor_positional_args = True
+         self.sampling_rate = sampling_rate
+         self.padding_value = padding_value
+         self.do_normalize = do_normalize
+
+     def call(
+         self,
+         inputs,
+         sampling_rate=None,
+         padding=None,
+         max_length=None,
+         pad_to_multiple_of=None,
+     ):
+         # Validate sampling rate.
+         if sampling_rate is not None and sampling_rate != self.sampling_rate:
+             raise ValueError(
+                 f"Expected sampling_rate {self.sampling_rate}, got "
+                 f"{sampling_rate}"
+             )
+
+         # Ensure inputs are (batch_size, time_steps, 1).
+         input_shape = keras.ops.shape(inputs)
+         input_rank = len(input_shape)
+         if input_rank == 2:
+             processed_inputs = keras.ops.expand_dims(inputs, axis=-1)
+         elif input_rank == 3:
+             processed_inputs = inputs
+         else:
+             raise ValueError(
+                 "Inputs must be mono audio: (batch_size, time_steps, 1)"
+             )
+
+         # Get original length and validate duration.
+         current_shape = keras.ops.shape(processed_inputs)
+         original_length = current_shape[1]
+         duration = (
+             keras.ops.cast(original_length, keras.backend.floatx())
+             / self.sampling_rate
+         )
+         # Source: https://github.com/usefulsensors/moonshine/blob/4a000427bd36a1c2c6d20a86c672dbd850b44c88/moonshine/transcribe.py#L20
+         is_invalid_duration = keras.ops.logical_or(
+             keras.ops.less(duration, 0.1), keras.ops.greater(duration, 64.0)
+         )
+
+         def print_warning_fn():
+             import warnings
+
+             warnings.warn(
+                 "Audio duration must be between 0.1 and 64 seconds. For "
+                 "transcribing longer segments, pre-segment your audio and "
+                 "provide shorter segments."
+             )
+             return keras.ops.convert_to_tensor(True, dtype="bool")
+
+         is_tf_symbolic = (
+             tf is not None
+             and hasattr(processed_inputs, "graph")
+             and hasattr(processed_inputs.graph, "as_graph_def")
+         )
+         use_tf_graph_ops = tf is not None and is_tf_symbolic
+         if use_tf_graph_ops and keras.config.backend() != "torch":
+             _ = tf.cond(
+                 is_invalid_duration,
+                 print_warning_fn,
+                 lambda: keras.ops.convert_to_tensor(False, dtype="bool"),
+             )
+         else:
+             if keras.ops.convert_to_numpy(is_invalid_duration):
+                 print_warning_fn()
+
+         # Handle padding.
+         if padding == "longest":
+             target_length = original_length
+             if pad_to_multiple_of:
+                 target_length = (
+                     (target_length + pad_to_multiple_of - 1)
+                     // pad_to_multiple_of
+                 ) * pad_to_multiple_of
+
+             needs_padding = keras.ops.greater(target_length, original_length)
+
+             def pad_fn():
+                 padding_amount = target_length - original_length
+                 paddings = [[0, 0], [0, padding_amount], [0, 0]]
+                 if use_tf_graph_ops and keras.config.backend() != "tensorflow":
+                     return tf.pad(
+                         processed_inputs,
+                         paddings,
+                         mode="CONSTANT",
+                         constant_values=float(self.padding_value),
+                     )
+                 else:
+                     return keras.ops.pad(
+                         processed_inputs,
+                         paddings,
+                         mode="constant",
+                         constant_values=self.padding_value,
+                     )
+
+             if use_tf_graph_ops and keras.config.backend() != "torch":
+                 processed_inputs = tf.cond(
+                     needs_padding, pad_fn, lambda: processed_inputs
+                 )
+             else:
+                 processed_inputs = keras.ops.cond(
+                     needs_padding, pad_fn, lambda: processed_inputs
+                 )
+
+         elif padding == "max_length" and max_length is not None:
+             target_length_const = max_length
+             if pad_to_multiple_of:
+                 target_length_const = (
+                     (target_length_const + pad_to_multiple_of - 1)
+                     // pad_to_multiple_of
+                 ) * pad_to_multiple_of
+
+             needs_padding = keras.ops.less(original_length, target_length_const)
+             needs_truncating = keras.ops.greater(
+                 original_length, target_length_const
+             )
+
+             def pad_fn():
+                 padding_amount = target_length_const - original_length
+                 paddings = [[0, 0], [0, padding_amount], [0, 0]]
+                 if use_tf_graph_ops and keras.config.backend() != "tensorflow":
+                     return tf.pad(
+                         processed_inputs,
+                         paddings,
+                         mode="CONSTANT",
+                         constant_values=float(self.padding_value),
+                     )
+                 else:
+                     return keras.ops.pad(
+                         processed_inputs,
+                         paddings,
+                         mode="constant",
+                         constant_values=self.padding_value,
+                     )
+
+             def trunc_fn():
+                 if use_tf_graph_ops and keras.config.backend() != "tensorflow":
+                     return processed_inputs[:, :target_length_const, :]
+                 else:
+                     return keras.ops.slice(
+                         processed_inputs,
+                         [0, 0, 0],
+                         [-1, target_length_const, -1],
+                     )
+
+             if use_tf_graph_ops and keras.config.backend() != "torch":
+                 processed_inputs = tf.cond(
+                     needs_padding,
+                     pad_fn,
+                     lambda: tf.cond(
+                         needs_truncating, trunc_fn, lambda: processed_inputs
+                     ),
+                 )
+             else:
+                 needs_padding = keras.ops.less(
+                     original_length, target_length_const
+                 )
+                 needs_truncating = keras.ops.greater(
+                     original_length, target_length_const
+                 )
+                 needs_padding_bool = keras.ops.convert_to_numpy(needs_padding)
+                 needs_truncating_bool = keras.ops.convert_to_numpy(
+                     needs_truncating
+                 )
+
+                 if needs_padding_bool:
+                     padding_amount = target_length_const - original_length
+                     paddings = [[0, 0], [0, padding_amount], [0, 0]]
+                     processed_inputs = keras.ops.pad(
+                         processed_inputs,
+                         paddings,
+                         mode="constant",
+                         constant_values=self.padding_value,
+                     )
+                 elif needs_truncating_bool:
+                     processed_inputs = processed_inputs[
+                         :, :target_length_const, :
+                     ]
+
+         # Normalize if enabled.
+         if self.do_normalize:
+             mean = keras.ops.mean(processed_inputs, axis=1, keepdims=True)
+             var = keras.ops.var(processed_inputs, axis=1, keepdims=True)
+             processed_inputs = (processed_inputs - mean) / keras.ops.sqrt(
+                 var + 1e-7
+             )
+
+         return processed_inputs
+
+     def compute_output_shape(self, input_shape):
+         # [batch_size, time_steps] → [batch_size, time_steps, 1].
+         if len(input_shape) == 2 or len(input_shape) == 3:
+             return (input_shape[0], None, 1)
+         else:
+             raise ValueError("Input shape must be rank 2 or 3.")
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "sampling_rate": self.sampling_rate,
+                 "padding_value": self.padding_value,
+                 "do_normalize": self.do_normalize,
+             }
+         )
+         return config
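As a usage note grounded in the call arguments documented above, the padding controls can be exercised directly on the converter. This is a small sketch of the two padding strategies and requires no pretrained weights.

```python
import keras
from keras_hub.layers import MoonshineAudioConverter

converter = MoonshineAudioConverter()

# Half a second of audio at 16 kHz, shape (batch, time_steps, 1).
audio = keras.ops.ones((1, 8000, 1))

# Pad (or truncate) the time axis to exactly 2 seconds of samples.
padded = converter(audio, padding="max_length", max_length=32000)
print(padded.shape)  # (1, 32000, 1)

# Pad the time axis up to the next multiple of 4096 samples.
rounded = converter(audio, padding="longest", pad_to_multiple_of=4096)
print(rounded.shape)  # (1, 8192, 1)
```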