sonusai 0.17.0__py3-none-any.whl → 0.17.3__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
Files changed (46)
  1. sonusai/audiofe.py +25 -54
  2. sonusai/calc_metric_spenh.py +212 -219
  3. sonusai/doc/doc.py +1 -1
  4. sonusai/mixture/__init__.py +2 -0
  5. sonusai/mixture/audio.py +12 -0
  6. sonusai/mixture/datatypes.py +11 -3
  7. sonusai/mixture/mixdb.py +100 -0
  8. sonusai/mixture/soundfile_audio.py +39 -0
  9. sonusai/mixture/sox_augmentation.py +3 -0
  10. sonusai/mixture/speaker_metadata.py +35 -0
  11. sonusai/mixture/torchaudio_audio.py +22 -0
  12. sonusai/mkmanifest.py +1 -1
  13. sonusai/mkwav.py +4 -4
  14. sonusai/onnx_predict.py +114 -410
  15. sonusai/post_spenh_targetf.py +2 -2
  16. sonusai/queries/queries.py +1 -1
  17. sonusai/speech/__init__.py +3 -0
  18. sonusai/speech/l2arctic.py +116 -0
  19. sonusai/speech/librispeech.py +99 -0
  20. sonusai/speech/mcgill.py +70 -0
  21. sonusai/speech/textgrid.py +100 -0
  22. sonusai/speech/timit.py +135 -0
  23. sonusai/speech/types.py +12 -0
  24. sonusai/speech/vctk.py +52 -0
  25. sonusai/speech/voxceleb.py +102 -0
  26. sonusai/utils/__init__.py +3 -2
  27. sonusai/utils/asr_functions/aaware_whisper.py +2 -2
  28. sonusai/utils/asr_manifest_functions/__init__.py +0 -1
  29. sonusai/utils/asr_manifest_functions/data.py +0 -8
  30. sonusai/utils/asr_manifest_functions/librispeech.py +1 -1
  31. sonusai/utils/asr_manifest_functions/mcgill_speech.py +1 -1
  32. sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +1 -1
  33. sonusai/utils/braced_glob.py +7 -3
  34. sonusai/utils/onnx_utils.py +110 -106
  35. sonusai/utils/path_info.py +7 -0
  36. sonusai/utils/{wave.py → write_audio.py} +2 -2
  37. {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/METADATA +3 -1
  38. {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/RECORD +40 -35
  39. {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/WHEEL +1 -1
  40. sonusai/calc_metric_spenh-save.py +0 -1334
  41. sonusai/onnx_predict-old.py +0 -240
  42. sonusai/onnx_predict-save.py +0 -487
  43. sonusai/ovino_predict.py +0 -508
  44. sonusai/ovino_query_devices.py +0 -47
  45. sonusai/torchl_onnx-old.py +0 -216
  46. {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/entry_points.txt +0 -0
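
Notable in this list: a new sonusai/speech subpackage (l2arctic, librispeech, mcgill, textgrid, timit, vctk, voxceleb), removal of the OpenVINO scripts and the -old/-save prediction variants, and the rename of sonusai/utils/wave.py to write_audio.py. The rename and the newly re-exported load_ort_session ripple through the audiofe.py diff below. A minimal caller-side sketch of the rename, assuming the signature (name, audio, sample rate) is unchanged, which is what the audiofe.py diff indicates:

import numpy as np

from sonusai.utils import write_audio  # 0.17.0: from sonusai.utils import write_wav

# One second of 16 kHz silence as stand-in audio for illustration
audio = np.zeros(16000, dtype=np.float32)
write_audio('capture.wav', audio, 16000)  # 0.17.0: write_wav('capture.wav', audio, 16000)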
sonusai/audiofe.py CHANGED
@@ -1,16 +1,15 @@
 """sonusai audiofe
 
-usage: audiofe [-hvds] [--version] [-i INPUT] [-l LENGTH] [-m MODEL] [-k CKPT] [-a ASR] [-w WMODEL]
+usage: audiofe [-hvds] [--version] [-i INPUT] [-l LENGTH] [-m MODEL] [-a ASR] [-w WMODEL]
 
 options:
     -h, --help
     -v, --verbose                   Be verbose.
     -d, --debug                     Write debug data to H5 file.
-    -s, --show                      Show a list of available audio inputs.
+    -s, --show                      Display a list of available audio inputs.
     -i INPUT, --input INPUT         Input audio.
     -l LENGTH, --length LENGTH      Length of audio in seconds. [default: -1].
-    -m MODEL, --model MODEL         PL model .py file path.
-    -k CKPT, --checkpoint CKPT      PL checkpoint file with weights.
+    -m MODEL, --model MODEL         ONNX model.
     -a ASR, --asr ASR               ASR method to use.
     -w WMODEL, --whisper WMODEL     Model used in whisper, aixplain_whisper and faster_whisper methods. [default: tiny].
 
@@ -66,7 +65,6 @@ def main() -> None:
     length = float(args['--length'])
     input_name = args['--input']
    model_name = args['--model']
-    ckpt_name = args['--checkpoint']
     asr_name = args['--asr']
     whisper_name = args['--whisper']
     debug = args['--debug']
@@ -76,10 +74,6 @@ def main() -> None:
 
     import h5py
     import pyaudio
-    import torch
-    from docopt import printable_usage
-    from sonusai_torchl.utils import load_torchl_ckpt_model
-    from sonusai.utils.onnx_utils import load_ort_session
 
     from sonusai import create_file_handler
     from sonusai import initial_log_messages
@@ -91,8 +85,8 @@ def main() -> None:
     from sonusai.utils import calc_asr
     from sonusai.utils import create_timestamp
     from sonusai.utils import get_input_devices
-    from sonusai.utils import trim_docstring
-    from sonusai.utils import write_wav
+    from sonusai.utils import load_ort_session
+    from sonusai.utils import write_audio
 
     ts = create_timestamp()
     capture_name = f'audiofe_capture_{ts}'
@@ -103,33 +97,6 @@ def main() -> None:
     predict_png = predict_name + '.png'
     h5_name = f'audiofe_{ts}.h5'
 
-    if model_name is not None:
-        from os.path import splitext
-        if splitext(model_name)[1] == '.onnx':
-            session, options, model_root, hparams, sess_inputs, sess_outputs = load_ort_session(model_name)
-            if hparams is None:
-                logger.error(f'Error: onnx model does not have required SonusAI hyper-parameters, can not proceed.')
-                raise SystemExit(1)
-            feature_mode = hparams["feature"]
-            model_is_onnx = True
-            in0name = sess_inputs[0].name
-            in0type = sess_inputs[0].type
-            out0name = sess_outputs[0].name
-            out_names = [n.name for n in session.get_outputs()]
-            if in0type.find('float16') != -1:
-                model_is_fp16 = True
-                logger.info(f'Detected input of float16, converting all feature inputs to that type.')
-            else:
-                model_is_fp16 = False
-        else:
-            model_is_onnx = False
-            if ckpt_name is None:
-                print(printable_usage(trim_docstring(__doc__)))
-                exit(1)
-            model = load_torchl_ckpt_model(model_name=model_name, ckpt_name=ckpt_name)
-            feature_mode = model.hparams.feature
-            model.eval()
-
     # Setup logging file
     create_file_handler('audiofe.log')
     update_console_handler(verbose)
@@ -153,8 +120,8 @@ def main() -> None:
         except ValueError as e:
            logger.exception(e)
            return
-        # Only write if capture, not for file input
-        write_wav(capture_wav, capture_audio, SAMPLE_RATE)
+        # Only write if capture from device, not for file input
+        write_audio(capture_wav, capture_audio, SAMPLE_RATE)
         logger.info('')
         logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_wav}')
 
@@ -171,7 +138,17 @@ def main() -> None:
         logger.info(f'Capture audio ASR: {capture_asr}')
 
     if model_name is not None:
-        feature = get_feature_from_audio(audio=capture_audio, feature_mode=feature_mode) #frames x stride x feat_params
+        session, options, model_root, hparams, sess_inputs, sess_outputs = load_ort_session(model_name)
+        if hparams is None:
+            logger.error(f'Error: ONNX model does not have required SonusAI hyperparameters, cannot proceed.')
+            raise SystemExit(1)
+        feature_mode = hparams.feature
+        in0name = sess_inputs[0].name
+        in0type = sess_inputs[0].type
+        out_names = [n.name for n in session.get_outputs()]
+
+        # frames x stride x feat_params
+        feature = get_feature_from_audio(audio=capture_audio, feature_mode=feature_mode)
         save_figure(capture_png, capture_audio, feature)
         logger.info(f'Wrote capture plots to {capture_png}')
 
@@ -182,19 +159,13 @@ def main() -> None:
                 f.create_dataset('feature', data=feature)
             logger.info(f'Wrote feature with shape {feature.shape} to {h5_name}')
 
-        if model_is_onnx:
-            # run ort session, wants i.e. batch x tsteps x feat_params, outputs numpy BxTxFP or BxFP
-            # Note full reshape not needed here since we assume speech enhanement type model, so a transpose suffices
-            if model_is_fp16:
-                feature = np.float16(feature)
-            # run inference, ort session wants i.e. batch x tsteps x feat_params, outputs numpy BxTxFP or BxFP
-            predict = np.transpose(session.run(out_names, {in0name: np.transpose(feature,(1,0,2))})[0],(1,0,2))
-        else:
-            with torch.no_grad():
-                # model wants batch x timesteps x feature_parameters
-                predict = model(torch.tensor(feature).permute((1, 0, 2))).permute(1, 0, 2).numpy()
-
+        if in0type.find('float16') != -1:
+            logger.info(f'Detected input of float16, converting all feature inputs to that type.')
+            feature = np.float16(feature)  # type: ignore
 
+        # Run inference, ort session wants batch x timesteps x feat_params, outputs numpy BxTxFP or BxFP
+        # Note full reshape not needed here since we assume speech enhancement type model, so a transpose suffices
+        predict = np.transpose(session.run(out_names, {in0name: np.transpose(feature, (1, 0, 2))})[0], (1, 0, 2))
 
 
         if debug:
@@ -204,7 +175,7 @@ def main() -> None:
            logger.info(f'Wrote predict with shape {predict.shape} to {h5_name}')
 
         predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
-        write_wav(predict_wav, predict_audio, SAMPLE_RATE)
+        write_audio(predict_wav, predict_audio, SAMPLE_RATE)
         logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {predict_wav}')
         if debug:
             with h5py.File(h5_name, 'a') as f:
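
Taken together, the audiofe.py changes replace the dual PyTorch-Lightning/ONNX path with a single ONNX Runtime path. A standalone sketch of the new inference flow, hedged: the enhance wrapper is illustrative, get_feature_from_audio/get_audio_from_feature are assumed to come from sonusai.mixture as in this file's unchanged imports, and logging plus the H5 debug writes are elided:

import numpy as np

from sonusai.mixture import get_audio_from_feature
from sonusai.mixture import get_feature_from_audio
from sonusai.utils import load_ort_session


def enhance(capture_audio: np.ndarray, model_name: str) -> np.ndarray:
    # load_ort_session returns the session plus the SonusAI hyperparameters
    # stored with the ONNX model; a model without them cannot be run here
    session, options, model_root, hparams, sess_inputs, sess_outputs = load_ort_session(model_name)
    if hparams is None:
        raise SystemExit('ONNX model does not have required SonusAI hyperparameters')
    feature_mode = hparams.feature
    in0name = sess_inputs[0].name
    in0type = sess_inputs[0].type
    out_names = [n.name for n in session.get_outputs()]

    # frames x stride x feat_params
    feature = get_feature_from_audio(audio=capture_audio, feature_mode=feature_mode)

    # Match a float16 model input
    if in0type.find('float16') != -1:
        feature = np.float16(feature)  # type: ignore

    # The session wants batch x timesteps x feat_params; for a speech enhancement
    # style model a transpose suffices instead of a full reshape
    predict = np.transpose(session.run(out_names, {in0name: np.transpose(feature, (1, 0, 2))})[0], (1, 0, 2))

    return get_audio_from_feature(feature=predict, feature_mode=feature_mode)

With the -k/--checkpoint option gone, a hypothetical invocation reduces to: audiofe -i capture.wav -m model.onnx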