keras-hub-nightly 0.21.0.dev202505230409__py3-none-any.whl → 0.21.0.dev202505240409__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- keras_hub/src/models/audio_to_text.py +66 -0
- keras_hub/src/models/audio_to_text_preprocessor.py +80 -0
- keras_hub/src/models/moonshine/__init__.py +5 -0
- keras_hub/src/models/moonshine/moonshine_audio_to_text.py +2 -2
- keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py +4 -2
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.21.0.dev202505230409.dist-info → keras_hub_nightly-0.21.0.dev202505240409.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.21.0.dev202505230409.dist-info → keras_hub_nightly-0.21.0.dev202505240409.dist-info}/RECORD +10 -8
- {keras_hub_nightly-0.21.0.dev202505230409.dist-info → keras_hub_nightly-0.21.0.dev202505240409.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.21.0.dev202505230409.dist-info → keras_hub_nightly-0.21.0.dev202505240409.dist-info}/top_level.txt +0 -0
keras_hub/src/models/audio_to_text.py
ADDED
@@ -0,0 +1,66 @@
+from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
+
+
+class AudioToText(Seq2SeqLM):
+    """Base class for audio-to-text models.
+
+    `AudioToText` tasks wrap a `keras_hub.models.Backbone` (capable of
+    processing audio and text features) and a
+    `keras_hub.models.AudioToTextPreprocessor` to create a model for
+    audio-to-text tasks like speech recognition or audio transcription.
+
+    These models typically consist of an encoder that processes audio input
+    and a decoder that generates a textual representation.
+
+    `AudioToText` tasks provide a high-level `generate()` method for
+    auto-regressively generating text from audio input. An optional text
+    prompt can also be provided to the decoder to guide generation. The
+    sampling strategy for generation (e.g., greedy, top-k, top-p) can be
+    controlled via the `sampler` argument in the `compile()` method.
+
+    When calling `fit()`, inputs should consist of audio data and
+    corresponding target text transcriptions. The model is trained to
+    predict the target text token-by-token.
+
+    All `AudioToText` tasks include a `from_preset()` constructor which can
+    be used to load pre-trained configurations and weights for specific
+    audio-to-text models. This constructor can also be called on the base
+    `AudioToText` class, which will automatically select the correct
+    subclass based on the preset.
+
+    Examples:
+    ```python
+    # `AudioToText` is a base class. You will typically work with a
+    # specific implementation, such as
+    # `keras_hub.models.MoonshineAudioToText`.
+
+    # Initialize a model from a preset using the specific subclass.
+    audio_to_text = keras_hub.models.MoonshineAudioToText.from_preset(
+        "moonshine_base_en"
+    )
+
+    # Or initialize from the base class, which selects the subclass.
+    audio_to_text = keras_hub.models.AudioToText.from_preset(
+        "moonshine_base_en"
+    )
+
+    # Generate text from an audio input.
+    audio_input_tensor = keras.random.normal((1, 16000, 1))
+    generated_output = audio_to_text.generate(
+        {"audio": audio_input_tensor}
+    )
+
+    # Generate conditioned on `"The quick brown fox."` as a text prompt.
+    prompted_output = audio_to_text.generate(
+        {"audio": audio_input_tensor, "text": "The quick brown fox."}
+    )
+
+    # Use a different sampling strategy for generation.
+    audio_to_text.compile(sampler="greedy")
+    greedy_output = audio_to_text.generate(
+        {"audio": audio_input_tensor}
+    )
+    ```
+    """
+
+    # TODO: Fill in once audio to text task model requirements are clearer.
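The docstring above shows generation only; the `fit()` contract it describes can be sketched as follows. This is a hedged illustration rather than code from the package: the preset name comes from the docstring, while the batch size, audio length, and transcription strings are invented for demonstration.

```python
import keras
import keras_hub

# Load the task with its attached preprocessor (preset name as used in
# the docstring above).
audio_to_text = keras_hub.models.MoonshineAudioToText.from_preset(
    "moonshine_base_en"
)

# With a preprocessor attached, `fit()` takes raw audio plus target
# transcriptions; the preprocessor tokenizes the text and builds the
# shifted decoder targets. Shapes and strings are illustrative only.
features = {
    "audio": keras.random.normal((2, 16000, 1)),
    "text": ["first transcription", "second transcription"],
}
audio_to_text.fit(x=features, batch_size=2)
```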
keras_hub/src/models/audio_to_text_preprocessor.py
ADDED
@@ -0,0 +1,80 @@
+from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor
+
+
+class AudioToTextPreprocessor(Seq2SeqLMPreprocessor):
+    """Base class for audio-to-text preprocessing layers.
+
+    `AudioToTextPreprocessor` layers wrap an audio feature extractor (specific
+    to the subclass) and a `keras_hub.tokenizer.Tokenizer` to create a
+    preprocessing layer for audio-to-text tasks. It is intended to be
+    paired with a `keras_hub.models.AudioToText` task.
+
+    Subclasses are expected to handle the conversion of raw audio data into
+    numerical features suitable for an encoder, and raw text data into token IDs
+    for a decoder.
+
+    All `AudioToTextPreprocessor` layers take a dictionary as input,
+    typically with keys like `"audio"` (for audio data) and `"text"` (for
+    target transcriptions or decoder prompts).
+
+    This layer will always output a `(x, y, sample_weight)` tuple, where `x`
+    is a dictionary containing processed audio features for the encoder and
+    tokenized text inputs for the decoder. `y` contains the target token IDs
+    (decoder input tokens shifted by one position), and `sample_weight`
+    indicates padding in `y`. The exact keys and structure of features within
+    `x` will depend on the specific subclass and the paired `AudioToText` model.
+
+    An `AudioToTextPreprocessor` includes `generate_preprocess` and
+    `generate_postprocess` methods for use during inference with an
+    `AudioToText` model's `generate()` method.
+
+    All `AudioToTextPreprocessor` layers include a `from_preset()` constructor
+    which can be used to load a pre-trained configuration, including tokenizer
+    vocabularies and audio feature extraction settings. Calling `from_preset()`
+    on this base class can instantiate the correct subclass registered for the
+    given preset.
+
+    Examples:
+    ```python
+    preprocessor = keras_hub.models.AudioToTextPreprocessor.from_preset(
+        "moonshine_base_en",
+        decoder_sequence_length=10
+    )
+
+    # Process a single audio-text pair.
+    x = {
+        "audio": keras.random.normal((1, 16000, 1)),
+        "text": ["the quick brown fox"]
+    }
+    x, y, sample_weight = preprocessor(x)
+
+    # Process a batch of audio-text pairs.
+    batch = {
+        "audio": keras.random.normal((2, 16000, 1)),
+        "text": ["first sentence", "second sentence"]
+    }
+    x, y, sample_weight = preprocessor(batch)
+
+    # With a `tf.data.Dataset`, reusing the raw `batch` dict from above.
+    audio_tf = keras.ops.convert_to_tensor(batch["audio"])
+    text_tf = batch["text"]  # List of strings.
+    x = {"audio": audio_tf, "text": text_tf}
+    ds = tf.data.Dataset.from_tensor_slices(x)
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+    ds = ds.batch(2)  # Batching after map.
+
+    # Generate preprocess and postprocess.
+    x = preprocessor.generate_preprocess({
+        "audio": keras.random.normal((1, 16000, 1)),
+        "text": ["optional prompt text"]
+    })
+    text = preprocessor.generate_postprocess({
+        "decoder_token_ids": keras.ops.array([[10, 20, 30, 2, 0]]),
+        "decoder_padding_mask": keras.ops.array([
+            [True, True, True, True, False]
+        ])
+    })
+    ```
+    """
+
+    # TODO: Fill in once audio to text task model requirements are clearer.
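The `(x, y, sample_weight)` contract described above is easiest to see on toy values. A minimal sketch of the shift relationship, with invented token IDs (1 and 2 standing in for start and end tokens, 0 for padding) rather than real Moonshine tokenizer output:

```python
import numpy as np

# Toy packed decoder inputs: [start, t1, t2, end, pad]. All IDs invented.
decoder_token_ids = np.array([[1, 10, 20, 2, 0]])
decoder_padding_mask = np.array([[True, True, True, True, False]])

# `y` is the decoder input shifted one position left: at each step the
# model is trained to predict the following token.
y = decoder_token_ids[:, 1:]  # [[10, 20, 2, 0]]

# `sample_weight` masks out positions in `y` that are padding.
sample_weight = decoder_padding_mask[:, 1:]  # [[True, True, True, False]]
```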
keras_hub/src/models/moonshine/__init__.py
ADDED
@@ -0,0 +1,5 @@
+from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
+from keras_hub.src.models.moonshine.moonshine_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, MoonshineBackbone)
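This import-time `register_presets` call is what makes the `"moonshine_base_en"` name used throughout the docstrings above resolvable. A hedged sketch of the effect; the preset name is taken from this diff, while the exact contents of the `presets` mapping depend on `moonshine_presets.py`, which this diff does not show:

```python
import keras_hub

# Registered presets appear on the class's `presets` mapping...
print(list(keras_hub.models.MoonshineBackbone.presets))

# ...and the generic factory can dispatch on the preset name alone.
backbone = keras_hub.models.Backbone.from_preset("moonshine_base_en")
```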
keras_hub/src/models/moonshine/moonshine_audio_to_text.py
CHANGED
@@ -1,6 +1,7 @@
 import keras
 
 from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.audio_to_text import AudioToText
 from keras_hub.src.models.moonshine.moonshine_audio_to_text_preprocessor import (  # noqa: E501
     MoonshineAudioToTextPreprocessor,
 )
@@ -9,12 +10,11 @@ from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
 from keras_hub.src.models.moonshine.moonshine_backbone import (
     compute_output_lengths,
 )
-from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
 from keras_hub.src.utils.tensor_utils import any_equal
 
 
 @keras_hub_export("keras_hub.models.MoonshineAudioToText")
-class MoonshineAudioToText(Seq2SeqLM):
+class MoonshineAudioToText(AudioToText):
     """An end-to-end Moonshine model for audio-to-text tasks.
 
     A Seq2Seq LM designed for audio-to-text tasks, such as speech recognition.
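Reparenting `MoonshineAudioToText` from `Seq2SeqLM` to `AudioToText` is what backs the base-class constructor advertised in the new `AudioToText` docstring. A sketch of the resolution; whether `AudioToText` is exported under `keras_hub.models` in this nightly is an assumption carried over from that docstring:

```python
import keras_hub

# The base-class factory resolves the preset to the Moonshine subclass.
model = keras_hub.models.AudioToText.from_preset("moonshine_base_en")
assert isinstance(model, keras_hub.models.MoonshineAudioToText)

# `AudioToText` itself extends `Seq2SeqLM`, so existing Seq2Seq LM
# behavior (compile, generate, fit) is unchanged by this reparenting.
assert isinstance(model, keras_hub.models.Seq2SeqLM)
```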
keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py
CHANGED
@@ -6,16 +6,18 @@ except ImportError:
     tf = None
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.audio_to_text_preprocessor import (
+    AudioToTextPreprocessor,
+)
 from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
 from keras_hub.src.models.moonshine.moonshine_tokenizer import (
     MoonshineTokenizer,
 )
-from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 
 @keras_hub_export("keras_hub.models.MoonshineAudioToTextPreprocessor")
-class MoonshineAudioToTextPreprocessor(Seq2SeqLMPreprocessor):
+class MoonshineAudioToTextPreprocessor(AudioToTextPreprocessor):
     """Moonshine Seq2Seq LM preprocessor for audio-to-text tasks.
 
     This preprocessor converts raw audio and text inputs into a format suitable
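The preprocessor gets the same treatment, so a customized preprocessor loaded through the base class can be paired with the task. A hedged sketch: the `decoder_sequence_length` override mirrors the `AudioToTextPreprocessor` docstring, and passing `preprocessor=` through `from_preset` follows the usual keras-hub task pattern rather than anything shown in this diff:

```python
import keras_hub

# Load the preprocessor via the base class with a shorter decoder length.
preprocessor = keras_hub.models.AudioToTextPreprocessor.from_preset(
    "moonshine_base_en",
    decoder_sequence_length=10,
)
assert isinstance(
    preprocessor, keras_hub.models.MoonshineAudioToTextPreprocessor
)

# Hand the customized preprocessor to the task.
model = keras_hub.models.MoonshineAudioToText.from_preset(
    "moonshine_base_en",
    preprocessor=preprocessor,
)
```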
keras_hub/src/version.py
CHANGED
-__version__ = "0.21.0.dev202505230409"
+__version__ = "0.21.0.dev202505240409"
{keras_hub_nightly-0.21.0.dev202505230409.dist-info → keras_hub_nightly-0.21.0.dev202505240409.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ keras_hub/models/__init__.py,sha256=itSzodVUeuX6HQnmsSXY0Wv-5Htbu397410R-SFW_4I,
 keras_hub/samplers/__init__.py,sha256=aFQIkiqbZpi8vjrPp2MVII4QUfE-eQjra5fMeHsoy7k,886
 keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
-keras_hub/src/version.py,sha256=
+keras_hub/src/version.py,sha256=AnU8tBqSqSoLY34F6O-fFt47PXgrGHUbqORa6_sXy6w,222
 keras_hub/src/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/layers/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/layers/modeling/alibi_bias.py,sha256=1XBTHI52L_iJDhN_w5ydu_iMhCuTgQAxEPwcLA6BPuk,4411
@@ -41,6 +41,8 @@ keras_hub/src/metrics/rouge_base.py,sha256=Pt2DUznhTTeR-fX1nQ_wSbPtmuTgxQTvrGpu8
 keras_hub/src/metrics/rouge_l.py,sha256=JlZhMBV6wS_6zMd57pkTc6yxHkEJT9fVQMlPZKekQzQ,2729
 keras_hub/src/metrics/rouge_n.py,sha256=JoFtmgjF4Ic263ny6bfD6vMHKreH9le3HnOOxemupRc,3620
 keras_hub/src/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+keras_hub/src/models/audio_to_text.py,sha256=XoOjXtKBX6K1fz-zOXcdVo3FpjuxCMnJZh2LQcYXb_0,2726
+keras_hub/src/models/audio_to_text_preprocessor.py,sha256=GS-WWyJ6aSsPRxi_0bxvxA00h2mT2FEwSdAoQXAUYVI,3249
 keras_hub/src/models/backbone.py,sha256=KS2x3HFWKhEYhroUFT3uZgSkeW_48zPGqUNvxCDDIQQ,11534
 keras_hub/src/models/causal_lm.py,sha256=ReaF-i3SHsCkHh4c28jM72QjMQ8x7yiCwG39FRb-7KE,16786
 keras_hub/src/models/causal_lm_preprocessor.py,sha256=YY7VJZicdmnjDSWi9g4_pEpd5bdJK166GlWcapvokF0,6663
@@ -265,10 +267,10 @@ keras_hub/src/models/mobilenet/mobilenet_image_classifier_preprocessor.py,sha256
 keras_hub/src/models/mobilenet/mobilenet_image_converter.py,sha256=a3Ka0UYYK5wHSOjf2oMHSgofRazTAeUfttklVefq14w,360
 keras_hub/src/models/mobilenet/mobilenet_presets.py,sha256=--nhaM6LmaiCtQlZPDwoQTHW7ciU0igzS4f9ssdD9Lo,1903
 keras_hub/src/models/mobilenet/util.py,sha256=S7j4UacmVIJ3fU8cymyAoK49eHcpWIKTOyUQiEjcbzQ,721
-keras_hub/src/models/moonshine/__init__.py,sha256=
+keras_hub/src/models/moonshine/__init__.py,sha256=WK_9Cy1dp5KplNAaTsaJbd-2DGLsiHQsIL5ZnXuCbDQ,275
 keras_hub/src/models/moonshine/moonshine_audio_converter.py,sha256=FnvR7SP44uVOsA3g9azUhQjsVg809eJ5nqoJZQ-DAq0,11854
-keras_hub/src/models/moonshine/moonshine_audio_to_text.py,sha256=
-keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py,sha256=
+keras_hub/src/models/moonshine/moonshine_audio_to_text.py,sha256=dXFtjaxL1jpcIAiiZY1-kcNL-S4RiRJiAC2uR_a3Fyc,15865
+keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py,sha256=hTw941ww8cJrP5DRrxv2DtZUNLJ9A3cayFhnsG5Ef4g,10016
 keras_hub/src/models/moonshine/moonshine_backbone.py,sha256=XtRUBe_VusXsFRk7-t1JNXM0lxp2UBOJk9v7gfTNDhA,19623
 keras_hub/src/models/moonshine/moonshine_decoder.py,sha256=Exf5Gg1gsCBST53wxOgBetKkhjS8E8QIUIlUwHlOkIY,11816
 keras_hub/src/models/moonshine/moonshine_encoder.py,sha256=NjjMO_FEBlWFSv6Appv8a3V7XovW2afvxxjXwQRgV60,8148
@@ -499,7 +501,7 @@ keras_hub/src/utils/transformers/preset_loader.py,sha256=1nfS5xVsl-JROGXJXltTqV1
 keras_hub/src/utils/transformers/safetensor_utils.py,sha256=CYUHyA4y-B61r7NDnCsFb4t_UmSwZ1k9L-8gzEd6KRg,3339
 keras_hub/tokenizers/__init__.py,sha256=uMjjm0mzUkRb0e4Ac_JK8aJ9cKGUi5UqmzWoWAFJprE,4164
 keras_hub/utils/__init__.py,sha256=jXPqVGBpJr_PpYmqD8aDG-fRMlxH-ulqCR2SZMn288Y,646
-keras_hub_nightly-0.21.0.
-keras_hub_nightly-0.21.0.
-keras_hub_nightly-0.21.0.
-keras_hub_nightly-0.21.0.
+keras_hub_nightly-0.21.0.dev202505240409.dist-info/METADATA,sha256=BJHRD68RtZc8CA6kIFWZxphjYr6g2t62j1FvwLar_LU,7393
+keras_hub_nightly-0.21.0.dev202505240409.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+keras_hub_nightly-0.21.0.dev202505240409.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
+keras_hub_nightly-0.21.0.dev202505240409.dist-info/RECORD,,
File without changes