keras-hub-nightly 0.21.0.dev202505230409__py3-none-any.whl → 0.21.0.dev202505240409__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keras_hub/src/models/audio_to_text.py ADDED
@@ -0,0 +1,66 @@
+ from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
+
+
+ class AudioToText(Seq2SeqLM):
+     """Base class for audio-to-text models.
+
+     `AudioToText` tasks wrap a `keras_hub.models.Backbone` (capable of
+     processing audio and text features) and a
+     `keras_hub.models.AudioToTextPreprocessor` to create a model for
+     audio-to-text tasks like speech recognition or audio transcription.
+
+     These models typically consist of an encoder that processes audio input
+     and a decoder that generates a textual representation.
+
+     `AudioToText` tasks provide a high-level `generate()` method for
+     auto-regressively generating text from audio input. An optional text
+     prompt can also be provided to the decoder to guide generation. The
+     sampling strategy for generation (e.g., greedy, top-k, top-p) can be
+     controlled via the `sampler` argument in the `compile()` method.
+
+     When calling `fit()`, inputs should consist of audio data and
+     corresponding target text transcriptions. The model is trained to
+     predict the target text token by token.
+
+     All `AudioToText` tasks include a `from_preset()` constructor which
+     can be used to load pre-trained configurations and weights for
+     specific audio-to-text models. This constructor can also be called on
+     the base `AudioToText` class, which will automatically select the
+     correct subclass based on the preset.
+
+     Examples:
+     ```python
+     # `AudioToText` is a base class. You will typically work with a
+     # specific subclass, such as `keras_hub.models.MoonshineAudioToText`.
+     # The following examples demonstrate common usage patterns.
+
+     # Initialize a model from a preset using the specific subclass.
+     audio_to_text_model = keras_hub.models.MoonshineAudioToText.from_preset(
+         "moonshine_base_en"
+     )
+
+     # Initialize a model from a preset using the base class.
+     audio_to_text_model_base = keras_hub.models.AudioToText.from_preset(
+         "moonshine_base_en"
+     )
+
+     # Generate text from an audio input.
+     audio_input_tensor = keras.random.normal((1, 16000, 1))
+     generated_output = audio_to_text_model.generate(
+         {"audio": audio_input_tensor}
+     )
+
+     # Generate text conditioned on "The quick brown fox." as a prompt.
+     prompted_output = audio_to_text_model.generate(
+         {"audio": audio_input_tensor, "text": "The quick brown fox."}
+     )
+
+     # Use a different sampling strategy for generation.
+     audio_to_text_model.compile(sampler="greedy")
+     greedy_output = audio_to_text_model.generate(
+         {"audio": audio_input_tensor}
+     )
+     ```
+     """
+
+     # TODO: Fill in once audio to text task model requirements are clearer.
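The docstring above describes `fit()` behavior but only demonstrates `generate()`. A minimal training sketch under the same assumptions as the docstring examples — random tensors stand in for real 16 kHz audio, and the attached preprocessor is assumed to accept the same `{"audio", "text"}` feature dict that `generate()` takes:

```python
import keras
import keras_hub

# `from_preset` on the base class resolves to the Moonshine subclass
# for this preset, with a preprocessor attached.
audio_to_text_model = keras_hub.models.AudioToText.from_preset(
    "moonshine_base_en"
)

# Placeholder data: four one-second clips at 16 kHz with transcriptions.
features = {
    "audio": keras.random.normal((4, 16000, 1)),
    "text": ["first", "second", "third", "fourth"],
}

# The preprocessor tokenizes the text and extracts audio features, so
# raw inputs can be passed to `fit()`; the model is trained to predict
# each target token from the previous ones, as the docstring describes.
audio_to_text_model.fit(x=features, batch_size=2)
```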
keras_hub/src/models/audio_to_text_preprocessor.py ADDED
@@ -0,0 +1,80 @@
+ from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor
+
+
+ class AudioToTextPreprocessor(Seq2SeqLMPreprocessor):
+     """Base class for audio-to-text preprocessing layers.
+
+     `AudioToTextPreprocessor` layers wrap an audio feature extractor
+     (specific to the subclass) and a `keras_hub.tokenizer.Tokenizer` to
+     create a preprocessing layer for audio-to-text tasks. Each is intended
+     to be paired with a `keras_hub.models.AudioToText` task.
+
+     Subclasses are expected to handle the conversion of raw audio data into
+     numerical features suitable for an encoder, and raw text data into
+     token IDs for a decoder.
+
+     All `AudioToTextPreprocessor` layers take a dictionary as input,
+     typically with keys like `"audio"` (for audio data) and `"text"` (for
+     target transcriptions or decoder prompts).
+
+     This layer will always output a `(x, y, sample_weight)` tuple, where
+     `x` is a dictionary containing processed audio features for the encoder
+     and tokenized text inputs for the decoder, `y` contains the target
+     token IDs (the decoder input tokens shifted one position to the left),
+     and `sample_weight` marks the padded positions in `y`. The exact keys
+     and structure of features within `x` depend on the subclass and the
+     paired `AudioToText` model.
+
+     An `AudioToTextPreprocessor` includes `generate_preprocess` and
+     `generate_postprocess` methods for use during inference with an
+     `AudioToText` model's `generate()` method.
+
+     All `AudioToTextPreprocessor` layers include a `from_preset()`
+     constructor which can be used to load a pre-trained configuration,
+     including tokenizer vocabularies and audio feature extraction settings.
+     Calling `from_preset()` on this base class will instantiate the correct
+     subclass registered for the given preset.
+
+     Examples:
+     ```python
+     preprocessor = keras_hub.models.AudioToTextPreprocessor.from_preset(
+         "moonshine_base_en",
+         decoder_sequence_length=10,
+     )
+
+     # Process a single audio-text pair.
+     x = {
+         "audio": keras.random.normal((1, 16000, 1)),
+         "text": ["the quick brown fox"],
+     }
+     x, y, sample_weight = preprocessor(x)
+
+     # Process a batch of audio-text pairs.
+     x = {
+         "audio": keras.random.normal((2, 16000, 1)),
+         "text": ["first sentence", "second sentence"],
+     }
+     x, y, sample_weight = preprocessor(x)
+
+     # Map the preprocessor over an unbatched `tf.data.Dataset`.
+     features = {
+         "audio": keras.random.normal((2, 16000, 1)),
+         "text": ["first sentence", "second sentence"],
+     }
+     ds = tf.data.Dataset.from_tensor_slices(features)
+     ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+     ds = ds.batch(2)  # Batch after mapping.
+
+     # Preprocess inputs for generation and decode generated token IDs.
+     x = preprocessor.generate_preprocess({
+         "audio": keras.random.normal((1, 16000, 1)),
+         "text": ["optional prompt text"],
+     })
+     text = preprocessor.generate_postprocess({
+         "decoder_token_ids": keras.ops.array([[10, 20, 30, 2, 0]]),
+         "decoder_padding_mask": keras.ops.array([
+             [True, True, True, True, False]
+         ]),
+     })
+     ```
+     """
+
+     # TODO: Fill in once audio to text task model requirements are clearer.
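The `(x, y, sample_weight)` contract in the docstring is the usual seq2seq shift-by-one convention. A standalone illustration with hypothetical token IDs (plain NumPy, no keras-hub calls):

```python
import numpy as np

# Hypothetical decoder tokens: [start, "the", "quick", "fox", end, pad].
decoder_token_ids = np.array([[1, 10, 11, 12, 2, 0]])
decoder_padding_mask = np.array([[True, True, True, True, True, False]])

# The decoder consumes every position except the last...
decoder_inputs = decoder_token_ids[:, :-1]
# ...and `y` is the same sequence shifted left by one, so position i is
# trained to predict the token at position i + 1.
y = decoder_token_ids[:, 1:]
# `sample_weight` zeroes out the padded positions in the loss.
sample_weight = decoder_padding_mask[:, 1:]
```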
keras_hub/src/models/moonshine/__init__.py CHANGED
@@ -0,0 +1,5 @@
+ from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
+ from keras_hub.src.models.moonshine.moonshine_presets import backbone_presets
+ from keras_hub.src.utils.preset_utils import register_presets
+
+ register_presets(backbone_presets, MoonshineBackbone)
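`register_presets` attaches the Moonshine preset metadata to the backbone class at import time. A quick way to confirm the registration took effect, assuming the standard keras-hub `presets` property:

```python
import keras_hub

# Preset names registered above become discoverable on the class.
print(keras_hub.models.MoonshineBackbone.presets.keys())
```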
keras_hub/src/models/moonshine/moonshine_audio_to_text.py CHANGED
@@ -1,6 +1,7 @@
  import keras

  from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.audio_to_text import AudioToText
  from keras_hub.src.models.moonshine.moonshine_audio_to_text_preprocessor import (  # noqa: E501
      MoonshineAudioToTextPreprocessor,
  )
@@ -9,12 +10,11 @@ from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
  from keras_hub.src.models.moonshine.moonshine_backbone import (
      compute_output_lengths,
  )
- from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
  from keras_hub.src.utils.tensor_utils import any_equal


  @keras_hub_export("keras_hub.models.MoonshineAudioToText")
- class MoonshineAudioToText(Seq2SeqLM):
+ class MoonshineAudioToText(AudioToText):
      """An end-to-end Moonshine model for audio-to-text tasks.

      A Seq2Seq LM designed for audio-to-text tasks, such as speech recognition.
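This re-parenting is what lets the base-class constructor described in the new `AudioToText` docstring resolve Moonshine presets. A sanity check of the new hierarchy (assuming the preset weights download successfully):

```python
import keras_hub

# `from_preset` on the base class selects the registered subclass.
model = keras_hub.models.AudioToText.from_preset("moonshine_base_en")
assert isinstance(model, keras_hub.models.MoonshineAudioToText)
```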
keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py CHANGED
@@ -6,16 +6,18 @@ except ImportError:
      tf = None
  from keras_hub.src.api_export import keras_hub_export
  from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+ from keras_hub.src.models.audio_to_text_preprocessor import (
+     AudioToTextPreprocessor,
+ )
  from keras_hub.src.models.moonshine.moonshine_backbone import MoonshineBackbone
  from keras_hub.src.models.moonshine.moonshine_tokenizer import (
      MoonshineTokenizer,
  )
- from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor
  from keras_hub.src.utils.tensor_utils import preprocessing_function


  @keras_hub_export("keras_hub.models.MoonshineAudioToTextPreprocessor")
- class MoonshineAudioToTextPreprocessor(Seq2SeqLMPreprocessor):
+ class MoonshineAudioToTextPreprocessor(AudioToTextPreprocessor):
      """Moonshine Seq2Seq LM preprocessor for audio-to-text tasks.

      This preprocessor converts raw audio and text inputs into a format suitable
keras_hub/src/version.py CHANGED
@@ -1,7 +1,7 @@
  from keras_hub.src.api_export import keras_hub_export

  # Unique source of truth for the version number.
- __version__ = "0.21.0.dev202505230409"
+ __version__ = "0.21.0.dev202505240409"


  @keras_hub_export("keras_hub.version")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: keras-hub-nightly
- Version: 0.21.0.dev202505230409
+ Version: 0.21.0.dev202505240409
  Summary: Pretrained models for Keras.
  Author-email: Keras team <keras-users@googlegroups.com>
  License-Expression: Apache-2.0
@@ -5,7 +5,7 @@ keras_hub/models/__init__.py,sha256=itSzodVUeuX6HQnmsSXY0Wv-5Htbu397410R-SFW_4I,
  keras_hub/samplers/__init__.py,sha256=aFQIkiqbZpi8vjrPp2MVII4QUfE-eQjra5fMeHsoy7k,886
  keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
- keras_hub/src/version.py,sha256=SjWdrHYDbNitBzSsMmxG-HvuuqsSB3ICvTQclkoX-Os,222
+ keras_hub/src/version.py,sha256=AnU8tBqSqSoLY34F6O-fFt47PXgrGHUbqORa6_sXy6w,222
  keras_hub/src/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  keras_hub/src/layers/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  keras_hub/src/layers/modeling/alibi_bias.py,sha256=1XBTHI52L_iJDhN_w5ydu_iMhCuTgQAxEPwcLA6BPuk,4411
@@ -41,6 +41,8 @@ keras_hub/src/metrics/rouge_base.py,sha256=Pt2DUznhTTeR-fX1nQ_wSbPtmuTgxQTvrGpu8
  keras_hub/src/metrics/rouge_l.py,sha256=JlZhMBV6wS_6zMd57pkTc6yxHkEJT9fVQMlPZKekQzQ,2729
  keras_hub/src/metrics/rouge_n.py,sha256=JoFtmgjF4Ic263ny6bfD6vMHKreH9le3HnOOxemupRc,3620
  keras_hub/src/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ keras_hub/src/models/audio_to_text.py,sha256=XoOjXtKBX6K1fz-zOXcdVo3FpjuxCMnJZh2LQcYXb_0,2726
+ keras_hub/src/models/audio_to_text_preprocessor.py,sha256=GS-WWyJ6aSsPRxi_0bxvxA00h2mT2FEwSdAoQXAUYVI,3249
  keras_hub/src/models/backbone.py,sha256=KS2x3HFWKhEYhroUFT3uZgSkeW_48zPGqUNvxCDDIQQ,11534
  keras_hub/src/models/causal_lm.py,sha256=ReaF-i3SHsCkHh4c28jM72QjMQ8x7yiCwG39FRb-7KE,16786
  keras_hub/src/models/causal_lm_preprocessor.py,sha256=YY7VJZicdmnjDSWi9g4_pEpd5bdJK166GlWcapvokF0,6663
@@ -265,10 +267,10 @@ keras_hub/src/models/mobilenet/mobilenet_image_classifier_preprocessor.py,sha256
  keras_hub/src/models/mobilenet/mobilenet_image_converter.py,sha256=a3Ka0UYYK5wHSOjf2oMHSgofRazTAeUfttklVefq14w,360
  keras_hub/src/models/mobilenet/mobilenet_presets.py,sha256=--nhaM6LmaiCtQlZPDwoQTHW7ciU0igzS4f9ssdD9Lo,1903
  keras_hub/src/models/mobilenet/util.py,sha256=S7j4UacmVIJ3fU8cymyAoK49eHcpWIKTOyUQiEjcbzQ,721
- keras_hub/src/models/moonshine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ keras_hub/src/models/moonshine/__init__.py,sha256=WK_9Cy1dp5KplNAaTsaJbd-2DGLsiHQsIL5ZnXuCbDQ,275
  keras_hub/src/models/moonshine/moonshine_audio_converter.py,sha256=FnvR7SP44uVOsA3g9azUhQjsVg809eJ5nqoJZQ-DAq0,11854
- keras_hub/src/models/moonshine/moonshine_audio_to_text.py,sha256=295kTM-XfUqb5mYjVSApKzMGPtnRyQdwynqqcPS7a_M,15860
- keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py,sha256=TZPvyMcPJ4Ojjv7r6ZUeafssIIVtFvPPzdiRHkK9O_A,10002
+ keras_hub/src/models/moonshine/moonshine_audio_to_text.py,sha256=dXFtjaxL1jpcIAiiZY1-kcNL-S4RiRJiAC2uR_a3Fyc,15865
+ keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py,sha256=hTw941ww8cJrP5DRrxv2DtZUNLJ9A3cayFhnsG5Ef4g,10016
  keras_hub/src/models/moonshine/moonshine_backbone.py,sha256=XtRUBe_VusXsFRk7-t1JNXM0lxp2UBOJk9v7gfTNDhA,19623
  keras_hub/src/models/moonshine/moonshine_decoder.py,sha256=Exf5Gg1gsCBST53wxOgBetKkhjS8E8QIUIlUwHlOkIY,11816
  keras_hub/src/models/moonshine/moonshine_encoder.py,sha256=NjjMO_FEBlWFSv6Appv8a3V7XovW2afvxxjXwQRgV60,8148
@@ -499,7 +501,7 @@ keras_hub/src/utils/transformers/preset_loader.py,sha256=1nfS5xVsl-JROGXJXltTqV1
  keras_hub/src/utils/transformers/safetensor_utils.py,sha256=CYUHyA4y-B61r7NDnCsFb4t_UmSwZ1k9L-8gzEd6KRg,3339
  keras_hub/tokenizers/__init__.py,sha256=uMjjm0mzUkRb0e4Ac_JK8aJ9cKGUi5UqmzWoWAFJprE,4164
  keras_hub/utils/__init__.py,sha256=jXPqVGBpJr_PpYmqD8aDG-fRMlxH-ulqCR2SZMn288Y,646
- keras_hub_nightly-0.21.0.dev202505230409.dist-info/METADATA,sha256=i-P2LqVLiVN0cIr63OvwxpAmqn2sGBPSUHDqbHiFhcg,7393
- keras_hub_nightly-0.21.0.dev202505230409.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
- keras_hub_nightly-0.21.0.dev202505230409.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
- keras_hub_nightly-0.21.0.dev202505230409.dist-info/RECORD,,
+ keras_hub_nightly-0.21.0.dev202505240409.dist-info/METADATA,sha256=BJHRD68RtZc8CA6kIFWZxphjYr6g2t62j1FvwLar_LU,7393
+ keras_hub_nightly-0.21.0.dev202505240409.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+ keras_hub_nightly-0.21.0.dev202505240409.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
+ keras_hub_nightly-0.21.0.dev202505240409.dist-info/RECORD,,