sonusai 0.16.0.tar.gz → 0.17.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonusai-0.16.0 → sonusai-0.17.0}/PKG-INFO +1 -1
- {sonusai-0.16.0 → sonusai-0.17.0}/pyproject.toml +4 -1
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/__init__.py +1 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/audiofe.py +157 -61
- sonusai-0.17.0/sonusai/calc_metric_spenh-save.py +1334 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/calc_metric_spenh.py +15 -8
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/genft.py +15 -6
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/genmix.py +14 -6
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/genmixdb.py +14 -6
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/gentcst.py +13 -6
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/lsdb.py +15 -5
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mkmanifest.py +14 -6
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mkwav.py +15 -6
- sonusai-0.17.0/sonusai/onnx_predict-save.py +487 -0
- sonusai-0.17.0/sonusai/onnx_predict.py +504 -0
- sonusai-0.17.0/sonusai/ovino_predict.py +508 -0
- sonusai-0.17.0/sonusai/ovino_query_devices.py +47 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/plot.py +16 -6
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/post_spenh_targetf.py +13 -6
- sonusai-0.17.0/sonusai/summarize_metric_spenh.py +71 -0
- sonusai-0.17.0/sonusai/torchl_onnx-old.py +216 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/tplot.py +14 -6
- sonusai-0.17.0/sonusai/utils/onnx_utils.py +154 -0
- sonusai-0.16.0/sonusai/utils/onnx_utils.py +0 -65
- {sonusai-0.16.0 → sonusai-0.17.0}/README.rst +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/aawscd_probwrite.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/data/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/data/genmixdb.yml +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/data/speech_ma01_01.wav +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/data/whitenoise.wav +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/doc/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/doc/doc.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/doc.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/main.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_class_weights.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_optimal_thresholds.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_pcm.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_pesq.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_sa_sdr.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_sample_weights.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_wer.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/calc_wsdr.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/class_summary.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/confusion_matrix_summary.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/one_hot.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/metrics/snr_summary.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/audio.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/augmentation.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/class_count.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/config.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/constants.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/datatypes.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/eq_rule_is_valid.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/feature.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/generation.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/helpers.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/log_duration_and_sizes.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/mapped_snr_f.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/mixdb.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/soundfile_audio.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/sox_audio.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/sox_augmentation.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/spectral_mask.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/target_class_balancing.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/targets.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/tokenized_shell_vars.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/torchaudio_audio.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/torchaudio_augmentation.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/crm.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/data.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/energy.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/file.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/phoneme.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/sed.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/mixture/truth_functions/target.py +0 -0
- /sonusai-0.16.0/sonusai/onnx_predict.py → /sonusai-0.17.0/sonusai/onnx_predict-old.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/queries/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/queries/queries.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asl_p56.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_functions/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_functions/aaware_whisper.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_manifest_functions/__init__.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_manifest_functions/data.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_manifest_functions/librispeech.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_manifest_functions/mcgill_speech.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/audio_devices.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/braced_glob.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/calculate_input_shape.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/convert_string_to_number.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/create_timestamp.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/create_ts_name.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/dataclass_from_dict.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/db.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/docstring.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/energy_f.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/engineering_number.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/get_frames_per_batch.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/get_label_names.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/grouper.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/human_readable_size.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/max_text_width.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/model_utils.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/numeric_conversion.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/parallel.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/print_mixture_details.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/ranges.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/read_mixture_data.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/read_predict_data.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/reshape.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/seconds_to_hms.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/stacked_complex.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/stratified_shuffle_split.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/wave.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/utils/yes_or_no.py +0 -0
- {sonusai-0.16.0 → sonusai-0.17.0}/sonusai/vars.py +0 -0
--- sonusai-0.16.0/pyproject.toml
+++ sonusai-0.17.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sonusai"
-version = "0.16.0"
+version = "0.17.0"
 description = "Framework for building deep neural network models for sound, speech, and voice AI"
 authors = ["Chris Eddington <chris@aaware.com>", "Jason Calderwood <jason@aaware.com>"]
 maintainers = ["Chris Eddington <chris@aaware.com>", "Jason Calderwood <jason@aaware.com>"]
@@ -40,10 +40,13 @@ torchaudio = "~2.2"
 tqdm = "^4.66.1"

 [tool.poetry.group.dev.dependencies]
+einops = "^0.8.0"
 icecream = "^2.1.3"
 mypy = "^1.6.0"
 mypy-extensions = "^1.0.0"
 pytest = "^8.1.1"
+sonusai-asr-cloud = "^0.1.0"
+sonusai-torchl = "^0.1.0"
 types-pyyaml = "^6.0.12.12"
 types-requests = "^2.31.0.8"

--- sonusai-0.16.0/sonusai/__init__.py
+++ sonusai-0.17.0/sonusai/__init__.py
@@ -19,6 +19,7 @@ commands_doc = """
     onnx_predict            Run ONNX predict on a trained model
     plot                    Plot mixture data
     post_spenh_targetf      Run post-processing for speech enhancement targetf data
+    summarize_metric_spenh  Summarize speech enhancement and analysis results
     tplot                   Plot truth data
     vars                    List custom SonusAI variables
 """
--- sonusai-0.16.0/sonusai/audiofe.py
+++ sonusai-0.17.0/sonusai/audiofe.py
@@ -12,7 +12,7 @@ options:
    -m MODEL, --model MODEL      PL model .py file path.
    -k CKPT, --checkpoint CKPT   PL checkpoint file with weights.
    -a ASR, --asr ASR            ASR method to use.
-   -w WMODEL, --whisper WMODEL
+   -w WMODEL, --whisper WMODEL  Model used in whisper, aixplain_whisper and faster_whisper methods. [default: tiny].

 Aaware SonusAI Audio Front End.

@@ -24,47 +24,43 @@ audiofe_capture_<TIMESTAMP>.wav.
 If a model is specified, run prediction on audio data from this model. Then compute the inverse transform of the
 prediction result and save to audiofe_predict_<TIMESTAMP>.wav.

+Also, if a model is specified, save plots of the capture data (time-domain signal and feature) to
+audiofe_capture_<TIMESTAMP>.png and predict data (time-domain signal and feature) to
+audiofe_predict_<TIMESTAMP>.png.
+
 If an ASR is specified, run ASR on the captured audio and print the results. In addition, if a model was also specified,
-run ASR on the predict audio and print the results.
+run ASR on the predict audio and print the results. Examples: faster_whisper, google,

 If the debug option is enabled, write capture audio, feature, reconstruct audio, predict, and predict audio to
 audiofe_<TIMESTAMP>.h5.

 """
-
-from select import select
-from sys import stdin
+import signal

-import h5py
 import numpy as np
-
-import torch
-from docopt import docopt
-from docopt import printable_usage
-
-import sonusai
-from sonusai import create_file_handler
-from sonusai import initial_log_messages
-from sonusai import logger
-from sonusai import update_console_handler
+
 from sonusai.mixture import AudioT
-
-
-
-
-
-from sonusai
-
-
-
-
-
-
+
+
+def signal_handler(_sig, _frame):
+    import sys
+
+    from sonusai import logger
+
+    logger.info('Canceled due to keyboard interrupt')
+    sys.exit(1)
+
+
+signal.signal(signal.SIGINT, signal_handler)


 def main() -> None:
+    from docopt import docopt
+
+    import sonusai
+    from sonusai.utils import trim_docstring
+
     args = docopt(trim_docstring(__doc__), version=sonusai.__version__, options_first=True)
-    ts = create_timestamp()

     verbose = args['--verbose']
     length = float(args['--length'])
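The hunk above also shows this release's recurring refactor (the +15/-6, +14/-6 counts on genft, genmix, genmixdb, and the other entry points in the file list): module-level imports are pruned and re-imported inside main(), and a SIGINT handler replaces the default traceback. The likely motive, deferring heavy imports so `sonusai <command> --help` stays fast, is an inference from the pattern, not stated in the diff. A minimal sketch of the same structure, with an illustrative docstring rather than the package's own:

```python
"""usage: mytool [-w WMODEL]

options:
    -w WMODEL, --whisper WMODEL  Whisper model to use. [default: tiny]
"""
import signal


def signal_handler(_sig, _frame):
    import sys
    sys.exit(1)  # exit quietly on Ctrl-C instead of dumping a traceback


signal.signal(signal.SIGINT, signal_handler)


def main() -> None:
    # Deferred import: '--help' and '--version' never pay for heavy dependencies.
    from docopt import docopt

    args = docopt(__doc__)
    # docopt reads defaults out of the docstring itself, so args['--whisper']
    # is 'tiny' when the flag is omitted, matching the [default: tiny] added above.
    print(args['--whisper'])


if __name__ == '__main__':
    main()
```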
@@ -76,13 +72,63 @@ def main() -> None:
     debug = args['--debug']
     show = args['--show']

-
-
+    from os.path import exists
+
+    import h5py
+    import pyaudio
+    import torch
+    from docopt import printable_usage
+    from sonusai_torchl.utils import load_torchl_ckpt_model
+    from sonusai.utils.onnx_utils import load_ort_session
+
+    from sonusai import create_file_handler
+    from sonusai import initial_log_messages
+    from sonusai import logger
+    from sonusai import update_console_handler
+    from sonusai.mixture import SAMPLE_RATE
+    from sonusai.mixture import get_audio_from_feature
+    from sonusai.mixture import get_feature_from_audio
+    from sonusai.utils import calc_asr
+    from sonusai.utils import create_timestamp
+    from sonusai.utils import get_input_devices
+    from sonusai.utils import trim_docstring
+    from sonusai.utils import write_wav
+
+    ts = create_timestamp()
+    capture_name = f'audiofe_capture_{ts}'
+    capture_wav = capture_name + '.wav'
+    capture_png = capture_name + '.png'
+    predict_name = f'audiofe_predict_{ts}'
+    predict_wav = predict_name + '.wav'
+    predict_png = predict_name + '.png'
     h5_name = f'audiofe_{ts}.h5'

-    if model_name is not None
-
-
+    if model_name is not None:
+        from os.path import splitext
+        if splitext(model_name)[1] == '.onnx':
+            session, options, model_root, hparams, sess_inputs, sess_outputs = load_ort_session(model_name)
+            if hparams is None:
+                logger.error(f'Error: onnx model does not have required SonusAI hyper-parameters, can not proceed.')
+                raise SystemExit(1)
+            feature_mode = hparams["feature"]
+            model_is_onnx = True
+            in0name = sess_inputs[0].name
+            in0type = sess_inputs[0].type
+            out0name = sess_outputs[0].name
+            out_names = [n.name for n in session.get_outputs()]
+            if in0type.find('float16') != -1:
+                model_is_fp16 = True
+                logger.info(f'Detected input of float16, converting all feature inputs to that type.')
+            else:
+                model_is_fp16 = False
+        else:
+            model_is_onnx = False
+            if ckpt_name is None:
+                print(printable_usage(trim_docstring(__doc__)))
+                exit(1)
+            model = load_torchl_ckpt_model(model_name=model_name, ckpt_name=ckpt_name)
+            feature_mode = model.hparams.feature
+            model.eval()

     # Setup logging file
     create_file_handler('audiofe.log')
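`load_ort_session` comes from the rewritten `sonusai/utils/onnx_utils.py` (+154 lines in the file list); the diff shows its return tuple but not its body. The fp16 sniffing it feeds, however, is plain onnxruntime API, sketched here with a stand-in model path:

```python
import onnxruntime as ort

# Stand-in path; SonusAI's load_ort_session() additionally recovers the
# hyper-parameters (hparams) that plain onnxruntime does not provide.
session = ort.InferenceSession('model.onnx', providers=['CPUExecutionProvider'])

in0 = session.get_inputs()[0]                        # NodeArg with .name/.type/.shape
out_names = [o.name for o in session.get_outputs()]

# Input types are strings such as 'tensor(float)' or 'tensor(float16)',
# which is why the code above searches in0type for 'float16'.
model_is_fp16 = 'float16' in in0.type
```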
@@ -107,26 +153,28 @@ def main() -> None:
     except ValueError as e:
         logger.exception(e)
         return
+    # Only write if capture, not for file input
+    write_wav(capture_wav, capture_audio, SAMPLE_RATE)
+    logger.info('')
+    logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_wav}')

-    write_wav(capture_name, capture_audio, SAMPLE_RATE)
-    logger.info('')
-    logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_name}')
     if debug:
         with h5py.File(h5_name, 'a') as f:
             if 'capture_audio' in f:
                 del f['capture_audio']
             f.create_dataset('capture_audio', data=capture_audio)
-        logger.info(f'Wrote capture
+        logger.info(f'Wrote capture feature data with shape {capture_audio.shape} to {h5_name}')

     if asr_name is not None:
+        logger.info(f'Running ASR on captured audio with {asr_name} ...')
         capture_asr = calc_asr(capture_audio, engine=asr_name, whisper_model_name=whisper_name).text
         logger.info(f'Capture audio ASR: {capture_asr}')

     if model_name is not None:
-
-
+        feature = get_feature_from_audio(audio=capture_audio, feature_mode=feature_mode)  # frames x stride x feat_params
+        save_figure(capture_png, capture_audio, feature)
+        logger.info(f'Wrote capture plots to {capture_png}')

-        feature = get_feature_from_audio(audio=capture_audio, feature_mode=model.hparams.feature)
         if debug:
             with h5py.File(h5_name, 'a') as f:
                 if 'feature' in f:
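All of the debug writes in this file use the same overwrite-safe HDF5 idiom seen above: open in append mode, delete a stale dataset if one exists, then recreate it, since h5py's create_dataset raises if the name is already taken. A self-contained sketch with a hypothetical file name:

```python
import h5py
import numpy as np

capture_audio = np.zeros(16000, dtype=np.float32)  # placeholder data

with h5py.File('audiofe_debug.h5', 'a') as f:      # 'a' creates or appends
    if 'capture_audio' in f:
        del f['capture_audio']                     # drop the stale copy first
    f.create_dataset('capture_audio', data=capture_audio)
```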
@@ -134,23 +182,20 @@ def main() -> None:
                 f.create_dataset('feature', data=feature)
             logger.info(f'Wrote feature with shape {feature.shape} to {h5_name}')

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        with torch.no_grad():
-            # model wants batch x timesteps x feature_parameters
-            predict = model(torch.tensor(feature).permute((1, 0, 2))).permute(1, 0, 2).numpy()
+        if model_is_onnx:
+            # run ort session, wants i.e. batch x tsteps x feat_params, outputs numpy BxTxFP or BxFP
+            # Note full reshape not needed here since we assume speech enhanement type model, so a transpose suffices
+            if model_is_fp16:
+                feature = np.float16(feature)
+            # run inference, ort session wants i.e. batch x tsteps x feat_params, outputs numpy BxTxFP or BxFP
+            predict = np.transpose(session.run(out_names, {in0name: np.transpose(feature, (1, 0, 2))})[0], (1, 0, 2))
+        else:
+            with torch.no_grad():
+                # model wants batch x timesteps x feature_parameters
+                predict = model(torch.tensor(feature).permute((1, 0, 2))).permute(1, 0, 2).numpy()
+
+
+
         if debug:
             with h5py.File(h5_name, 'a') as f:
                 if 'predict' in f:
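Both branches above deal with the same layout mismatch: `get_feature_from_audio` returns frames x stride x feat_params (per the new inline comment), while the models want batch x timesteps x feat_params, so the first two axes are swapped going in and swapped back coming out; torch's `.permute((1, 0, 2))` in the else-branch is the same move on a tensor. A shape-only sketch with illustrative dimensions:

```python
import numpy as np

feature = np.zeros((100, 1, 42), dtype=np.float32)  # frames x stride x feat_params

model_in = np.transpose(feature, (1, 0, 2))         # 1 x 100 x 42: batch of one
assert model_in.shape == (1, 100, 42)

model_out = model_in                                # stand-in for the model call
predict = np.transpose(model_out, (1, 0, 2))        # back to frames x stride x feat_params
assert predict.shape == feature.shape
```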
@@ -158,9 +203,9 @@ def main() -> None:
                 f.create_dataset('predict', data=predict)
             logger.info(f'Wrote predict with shape {predict.shape} to {h5_name}')

-        predict_audio = get_audio_from_feature(feature=predict, feature_mode=
-        write_wav(
-        logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {
+        predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
+        write_wav(predict_wav, predict_audio, SAMPLE_RATE)
+        logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {predict_wav}')
         if debug:
             with h5py.File(h5_name, 'a') as f:
                 if 'predict_audio' in f:
@@ -168,12 +213,27 @@ def main() -> None:
                 f.create_dataset('predict_audio', data=predict_audio)
             logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {h5_name}')

+        save_figure(predict_png, predict_audio, predict)
+        logger.info(f'Wrote predict plots to {predict_png}')
+
         if asr_name is not None:
+            logger.info(f'Running ASR on model-enhanced audio with {asr_name} ...')
             predict_asr = calc_asr(predict_audio, engine=asr_name, whisper_model_name=whisper_name).text
             logger.info(f'Predict audio ASR: {predict_asr}')


 def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1024) -> AudioT:
+    from select import select
+    from sys import stdin
+
+    import pyaudio
+
+    from sonusai import logger
+    from sonusai.mixture import CHANNEL_COUNT
+    from sonusai.mixture import SAMPLE_RATE
+    from sonusai.utils import get_input_device_index_by_name
+    from sonusai.utils import get_input_devices
+
     p = pyaudio.PyAudio()

     input_devices = get_input_devices(p)
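`get_frames_from_device` builds on PyAudio's blocking capture API; the SonusAI imports it gains above supply device lookup and the SAMPLE_RATE/CHANNEL_COUNT constants. A hedged sketch of the underlying loop, with those constants hard-coded as assumptions:

```python
import numpy as np
import pyaudio

SAMPLE_RATE = 16000   # assumption; SonusAI exposes sonusai.mixture.SAMPLE_RATE
CHUNK = 1024          # matches the chunk=1024 default in the signature above
SECONDS = 2.0

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paFloat32, channels=1, rate=SAMPLE_RATE,
                input=True, frames_per_buffer=CHUNK)

raw = [stream.read(CHUNK) for _ in range(int(SECONDS * SAMPLE_RATE / CHUNK))]
audio = np.frombuffer(b''.join(raw), dtype=np.float32)  # mono float32 samples

stream.stop_stream()
stream.close()
p.terminate()
```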
@@ -224,6 +284,10 @@ def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1024) -> AudioT:


 def get_frames_from_file(input_name: str, length: float) -> AudioT:
+    from sonusai import logger
+    from sonusai.mixture import SAMPLE_RATE
+    from sonusai.mixture import read_audio
+
     logger.info(f'Capturing from {input_name}')
     frames = read_audio(input_name)
     if length != -1:
@@ -233,5 +297,37 @@ def get_frames_from_file(input_name: str, length: float) -> AudioT:
     return frames


+def save_figure(name: str, audio: np.ndarray, feature: np.ndarray) -> None:
+    import matplotlib.pyplot as plt
+    from scipy.interpolate import CubicSpline
+
+    from sonusai.mixture import SAMPLE_RATE
+    from sonusai.utils import unstack_complex
+
+    spectrum = 20 * np.log(np.abs(np.squeeze(unstack_complex(feature)).transpose()))
+    frames = spectrum.shape[1]
+    samples = (len(audio) // frames) * frames
+    length_in_s = samples / SAMPLE_RATE
+    interp = samples // frames
+
+    ts = np.arange(0.0, length_in_s, interp / SAMPLE_RATE)
+    t = np.arange(0.0, length_in_s, 1 / SAMPLE_RATE)
+
+    spectrum = CubicSpline(ts, spectrum, axis=-1)(t)
+
+    fig, (ax1, ax2) = plt.subplots(nrows=2)
+    ax1.set_title(name)
+    ax1.plot(t, audio[:samples])
+    ax1.set_ylabel('Signal')
+    ax1.set_xlim(0, length_in_s)
+    ax1.set_ylim(-1, 1)
+
+    ax2.imshow(spectrum, origin='lower', aspect='auto')
+    ax2.set_xticks([])
+    ax2.set_ylabel('Feature')
+
+    plt.savefig(name, dpi=300)
+
+
 if __name__ == '__main__':
     main()
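The one non-obvious step in the new save_figure is the CubicSpline call: the spectrum has a single column per feature frame, so it is resampled onto the per-sample time grid to share an x-axis with the waveform plot. An isolated sketch of that resampling, with illustrative shapes:

```python
import numpy as np
from scipy.interpolate import CubicSpline

SAMPLE_RATE = 16000                     # assumption, matching the sketch above
frames, interp = 100, 160               # 100 frames, 160 samples per frame
spectrum = np.random.rand(257, frames)  # feat_params x frames

length_in_s = frames * interp / SAMPLE_RATE
ts = np.arange(0.0, length_in_s, interp / SAMPLE_RATE)  # one point per frame
t = np.arange(0.0, length_in_s, 1 / SAMPLE_RATE)        # one point per sample

# axis=-1 interpolates every feature bin along time in a single call
upsampled = CubicSpline(ts, spectrum, axis=-1)(t)
assert upsampled.shape == (257, len(t))
```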