sonusai 0.16.1__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/audiofe.py +52 -17
- sonusai/calc_metric_spenh-save.py +1334 -0
- sonusai/calc_metric_spenh.py +1 -1
- sonusai/onnx_predict-old.py +240 -0
- sonusai/onnx_predict-save.py +487 -0
- sonusai/onnx_predict.py +448 -194
- sonusai/ovino_predict.py +508 -0
- sonusai/ovino_query_devices.py +47 -0
- sonusai/torchl_onnx-old.py +216 -0
- sonusai/utils/onnx_utils.py +128 -39
- {sonusai-0.16.1.dist-info → sonusai-0.17.0.dist-info}/METADATA +1 -1
- {sonusai-0.16.1.dist-info → sonusai-0.17.0.dist-info}/RECORD +14 -8
- {sonusai-0.16.1.dist-info → sonusai-0.17.0.dist-info}/WHEEL +1 -1
- {sonusai-0.16.1.dist-info → sonusai-0.17.0.dist-info}/entry_points.txt +0 -0
sonusai/audiofe.py
CHANGED
```diff
@@ -12,7 +12,7 @@ options:
     -m MODEL, --model MODEL     PL model .py file path.
     -k CKPT, --checkpoint CKPT  PL checkpoint file with weights.
     -a ASR, --asr ASR           ASR method to use.
-    -w WMODEL, --whisper WMODEL
+    -w WMODEL, --whisper WMODEL Model used in whisper, aixplain_whisper and faster_whisper methods. [default: tiny].
 
 Aaware SonusAI Audio Front End.
 
```
```diff
@@ -29,7 +29,7 @@ audiofe_capture_<TIMESTAMP>.png and predict data (time-domain signal and feature
 audiofe_predict_<TIMESTAMP>.png.
 
 If an ASR is specified, run ASR on the captured audio and print the results. In addition, if a model was also specified,
-run ASR on the predict audio and print the results.
+run ASR on the predict audio and print the results. Examples: faster_whisper, google,
 
 If the debug option is enabled, write capture audio, feature, reconstruct audio, predict, and predict audio to
 audiofe_<TIMESTAMP>.h5.
```
```diff
@@ -79,6 +79,7 @@ def main() -> None:
     import torch
     from docopt import printable_usage
     from sonusai_torchl.utils import load_torchl_ckpt_model
+    from sonusai.utils.onnx_utils import load_ort_session
 
     from sonusai import create_file_handler
     from sonusai import initial_log_messages
```
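The new import is the pivot of this release: `audiofe.py` can now serve either a PyTorch Lightning checkpoint or an ONNX model. The next hunk shows how the returned session, hyper-parameters, and I/O metadata get used; as a rough sketch of the contract `load_ort_session` appears to fulfill (the metadata key and JSON encoding below are assumptions, not the actual `sonusai/utils/onnx_utils.py` implementation):

```python
import json

import onnxruntime as ort


def load_ort_session_sketch(model_path: str):
    """Approximate load_ort_session's contract: return the ORT session plus
    any SonusAI hyper-parameters found in the model, or None when absent."""
    session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])

    # Assumption: hyper-parameters are stored as JSON in the ONNX custom
    # metadata map; the real helper may use a different key or encoding.
    meta = session.get_modelmeta().custom_metadata_map
    hparams = json.loads(meta['hparams']) if 'hparams' in meta else None

    return session, hparams, session.get_inputs(), session.get_outputs()
```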
```diff
@@ -102,9 +103,32 @@ def main() -> None:
     predict_png = predict_name + '.png'
     h5_name = f'audiofe_{ts}.h5'
 
-    if model_name is not None
-
-
+    if model_name is not None:
+        from os.path import splitext
+        if splitext(model_name)[1] == '.onnx':
+            session, options, model_root, hparams, sess_inputs, sess_outputs = load_ort_session(model_name)
+            if hparams is None:
+                logger.error(f'Error: onnx model does not have required SonusAI hyper-parameters, can not proceed.')
+                raise SystemExit(1)
+            feature_mode = hparams["feature"]
+            model_is_onnx = True
+            in0name = sess_inputs[0].name
+            in0type = sess_inputs[0].type
+            out0name = sess_outputs[0].name
+            out_names = [n.name for n in session.get_outputs()]
+            if in0type.find('float16') != -1:
+                model_is_fp16 = True
+                logger.info(f'Detected input of float16, converting all feature inputs to that type.')
+            else:
+                model_is_fp16 = False
+        else:
+            model_is_onnx = False
+            if ckpt_name is None:
+                print(printable_usage(trim_docstring(__doc__)))
+                exit(1)
+            model = load_torchl_ckpt_model(model_name=model_name, ckpt_name=ckpt_name)
+            feature_mode = model.hparams.feature
+            model.eval()
 
     # Setup logging file
     create_file_handler('audiofe.log')
```
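The float16 branch above works because onnxruntime reports input types as plain strings such as `'tensor(float)'` or `'tensor(float16)'`, so a substring test is sufficient. A minimal illustration with placeholder data:

```python
import numpy as np

in0type = 'tensor(float16)'  # stands in for sess_inputs[0].type
model_is_fp16 = in0type.find('float16') != -1

feature = np.random.rand(100, 1, 30).astype(np.float32)  # placeholder feature
if model_is_fp16:
    feature = np.float16(feature)  # same down-cast the new audiofe code performs
assert feature.dtype == np.float16
```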
```diff
@@ -129,26 +153,25 @@ def main() -> None:
         except ValueError as e:
             logger.exception(e)
             return
+        # Only write if capture, not for file input
+        write_wav(capture_wav, capture_audio, SAMPLE_RATE)
+        logger.info('')
+        logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_wav}')
 
-    write_wav(capture_wav, capture_audio, SAMPLE_RATE)
-    logger.info('')
-    logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_wav}')
     if debug:
         with h5py.File(h5_name, 'a') as f:
             if 'capture_audio' in f:
                 del f['capture_audio']
             f.create_dataset('capture_audio', data=capture_audio)
-        logger.info(f'Wrote capture
+        logger.info(f'Wrote capture feature data with shape {capture_audio.shape} to {h5_name}')
 
     if asr_name is not None:
+        logger.info(f'Running ASR on captured audio with {asr_name} ...')
         capture_asr = calc_asr(capture_audio, engine=asr_name, whisper_model_name=whisper_name).text
         logger.info(f'Capture audio ASR: {capture_asr}')
 
     if model_name is not None:
-
-        model.eval()
-
-        feature = get_feature_from_audio(audio=capture_audio, feature_mode=model.hparams.feature)
+        feature = get_feature_from_audio(audio=capture_audio, feature_mode=feature_mode) #frames x stride x feat_params
         save_figure(capture_png, capture_audio, feature)
         logger.info(f'Wrote capture plots to {capture_png}')
 
```
```diff
@@ -159,9 +182,20 @@ def main() -> None:
                 f.create_dataset('feature', data=feature)
             logger.info(f'Wrote feature with shape {feature.shape} to {h5_name}')
 
-
-        #
-
+        if model_is_onnx:
+            # run ort session, wants i.e. batch x tsteps x feat_params, outputs numpy BxTxFP or BxFP
+            # Note full reshape not needed here since we assume speech enhanement type model, so a transpose suffices
+            if model_is_fp16:
+                feature = np.float16(feature)
+            # run inference, ort session wants i.e. batch x tsteps x feat_params, outputs numpy BxTxFP or BxFP
+            predict = np.transpose(session.run(out_names, {in0name: np.transpose(feature,(1,0,2))})[0],(1,0,2))
+        else:
+            with torch.no_grad():
+                # model wants batch x timesteps x feature_parameters
+                predict = model(torch.tensor(feature).permute((1, 0, 2))).permute(1, 0, 2).numpy()
+
+
+
         if debug:
             with h5py.File(h5_name, 'a') as f:
                 if 'predict' in f:
```
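Both inference paths lean on the same trick: the feature arrives as frames x stride x feat_params while the model consumes batch x timesteps x feat_params, so swapping the first two axes on the way in and out replaces a full reshape. A toy round-trip check of that shape claim, with dummy sizes and an identity stand-in for the model:

```python
import numpy as np

frames, stride, feat_params = 100, 1, 30  # illustrative sizes only
feature = np.random.rand(frames, stride, feat_params).astype(np.float32)

model_in = np.transpose(feature, (1, 0, 2))   # -> (stride, frames, feat_params)
model_out = model_in                          # identity stand-in for session.run(...)
predict = np.transpose(model_out, (1, 0, 2))  # back to (frames, stride, feat_params)
assert predict.shape == feature.shape
```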
```diff
@@ -169,7 +203,7 @@ def main() -> None:
                 f.create_dataset('predict', data=predict)
             logger.info(f'Wrote predict with shape {predict.shape} to {h5_name}')
 
-        predict_audio = get_audio_from_feature(feature=predict, feature_mode=
+        predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
         write_wav(predict_wav, predict_audio, SAMPLE_RATE)
         logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {predict_wav}')
         if debug:
```
```diff
@@ -183,6 +217,7 @@ def main() -> None:
         logger.info(f'Wrote predict plots to {predict_png}')
 
         if asr_name is not None:
+            logger.info(f'Running ASR on model-enhanced audio with {asr_name} ...')
             predict_asr = calc_asr(predict_audio, engine=asr_name, whisper_model_name=whisper_name).text
             logger.info(f'Predict audio ASR: {predict_asr}')
 
```
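For reference, `calc_asr` as used in both ASR hunks takes the raw audio, an engine name (e.g. faster_whisper or google, per the updated docstring), and a whisper model name, and returns an object whose `.text` field holds the transcript. A usage sketch with silent dummy audio (the import path and the 16 kHz rate are assumptions):

```python
import numpy as np

from sonusai.utils import calc_asr  # import path is an assumption

# One second of silence; SonusAI's SAMPLE_RATE is assumed to be 16000.
audio = np.zeros(16000, dtype=np.float32)
result = calc_asr(audio, engine='faster_whisper', whisper_model_name='tiny')
print(result.text)
```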