sonusai 0.20.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +16 -3
- sonusai/audiofe.py +240 -76
- sonusai/calc_metric_spenh.py +71 -73
- sonusai/config/__init__.py +3 -0
- sonusai/config/config.py +61 -0
- sonusai/config/config.yml +20 -0
- sonusai/config/constants.py +8 -0
- sonusai/constants.py +11 -0
- sonusai/data/genmixdb.yml +21 -36
- sonusai/{mixture/datatypes.py → datatypes.py} +91 -130
- sonusai/deprecated/plot.py +4 -5
- sonusai/doc/doc.py +4 -4
- sonusai/doc.py +11 -4
- sonusai/genft.py +43 -45
- sonusai/genmetrics.py +23 -19
- sonusai/genmix.py +54 -82
- sonusai/genmixdb.py +88 -264
- sonusai/ir_metric.py +30 -34
- sonusai/lsdb.py +41 -48
- sonusai/main.py +15 -22
- sonusai/metrics/calc_audio_stats.py +4 -17
- sonusai/metrics/calc_class_weights.py +4 -4
- sonusai/metrics/calc_optimal_thresholds.py +8 -5
- sonusai/metrics/calc_pesq.py +2 -2
- sonusai/metrics/calc_segsnr_f.py +4 -4
- sonusai/metrics/calc_speech.py +25 -13
- sonusai/metrics/class_summary.py +7 -7
- sonusai/metrics/confusion_matrix_summary.py +5 -5
- sonusai/metrics/one_hot.py +4 -4
- sonusai/metrics/snr_summary.py +7 -7
- sonusai/metrics_summary.py +38 -45
- sonusai/mixture/__init__.py +5 -104
- sonusai/mixture/audio.py +10 -39
- sonusai/mixture/class_balancing.py +103 -0
- sonusai/mixture/config.py +251 -271
- sonusai/mixture/constants.py +35 -39
- sonusai/mixture/data_io.py +25 -36
- sonusai/mixture/db_datatypes.py +58 -22
- sonusai/mixture/effects.py +386 -0
- sonusai/mixture/feature.py +7 -11
- sonusai/mixture/generation.py +484 -611
- sonusai/mixture/helpers.py +82 -184
- sonusai/mixture/ir_delay.py +3 -4
- sonusai/mixture/ir_effects.py +77 -0
- sonusai/mixture/log_duration_and_sizes.py +6 -12
- sonusai/mixture/mixdb.py +931 -669
- sonusai/mixture/pad_audio.py +35 -0
- sonusai/mixture/resample.py +7 -0
- sonusai/mixture/sox_effects.py +195 -0
- sonusai/mixture/sox_help.py +650 -0
- sonusai/mixture/spectral_mask.py +2 -2
- sonusai/mixture/truth.py +17 -15
- sonusai/mixture/truth_functions/crm.py +12 -12
- sonusai/mixture/truth_functions/energy.py +22 -22
- sonusai/mixture/truth_functions/file.py +5 -5
- sonusai/mixture/truth_functions/metadata.py +4 -4
- sonusai/mixture/truth_functions/metrics.py +4 -4
- sonusai/mixture/truth_functions/phoneme.py +3 -3
- sonusai/mixture/truth_functions/sed.py +11 -13
- sonusai/mixture/truth_functions/target.py +10 -10
- sonusai/mkwav.py +26 -29
- sonusai/onnx_predict.py +240 -88
- sonusai/queries/__init__.py +2 -2
- sonusai/queries/queries.py +38 -34
- sonusai/speech/librispeech.py +1 -1
- sonusai/speech/mcgill.py +1 -1
- sonusai/speech/timit.py +2 -2
- sonusai/summarize_metric_spenh.py +10 -17
- sonusai/utils/__init__.py +7 -1
- sonusai/utils/asl_p56.py +2 -2
- sonusai/utils/asr.py +2 -2
- sonusai/utils/asr_functions/aaware_whisper.py +4 -5
- sonusai/utils/choice.py +31 -0
- sonusai/utils/compress.py +1 -1
- sonusai/utils/dataclass_from_dict.py +19 -1
- sonusai/utils/energy_f.py +3 -3
- sonusai/utils/evaluate_random_rule.py +15 -0
- sonusai/utils/keyboard_interrupt.py +12 -0
- sonusai/utils/onnx_utils.py +3 -17
- sonusai/utils/print_mixture_details.py +21 -19
- sonusai/utils/{temp_seed.py → rand.py} +3 -3
- sonusai/utils/read_predict_data.py +2 -2
- sonusai/utils/reshape.py +3 -3
- sonusai/utils/stratified_shuffle_split.py +3 -3
- sonusai/{mixture → utils}/tokenized_shell_vars.py +1 -1
- sonusai/utils/write_audio.py +2 -2
- sonusai/vars.py +11 -4
- {sonusai-0.20.2.dist-info → sonusai-1.0.1.dist-info}/METADATA +4 -2
- sonusai-1.0.1.dist-info/RECORD +138 -0
- sonusai/mixture/augmentation.py +0 -444
- sonusai/mixture/class_count.py +0 -15
- sonusai/mixture/eq_rule_is_valid.py +0 -45
- sonusai/mixture/target_class_balancing.py +0 -107
- sonusai/mixture/targets.py +0 -175
- sonusai-0.20.2.dist-info/RECORD +0 -128
- {sonusai-0.20.2.dist-info → sonusai-1.0.1.dist-info}/WHEEL +0 -0
- {sonusai-0.20.2.dist-info → sonusai-1.0.1.dist-info}/entry_points.txt +0 -0
sonusai/__init__.py
CHANGED
@@ -3,9 +3,6 @@ from importlib import metadata
|
|
3
3
|
from os.path import dirname
|
4
4
|
|
5
5
|
from rich.logging import RichHandler
|
6
|
-
from rich.traceback import install
|
7
|
-
|
8
|
-
install(show_locals=True)
|
9
6
|
|
10
7
|
__version__ = metadata.version(__package__) # pyright: ignore [reportArgumentType]
|
11
8
|
BASEDIR = dirname(__file__)
|
@@ -81,3 +78,19 @@ def commands_list(doc: str = commands_doc) -> list[str]:
|
|
81
78
|
if command:
|
82
79
|
commands.append(command)
|
83
80
|
return commands
|
81
|
+
|
82
|
+
|
83
|
+
def exception_handler(e: Exception) -> None:
    """Log a caught exception and terminate the process with exit status 1.

    A one-line ``<ExceptionType>: <message>`` summary is logged at ERROR level.
    If the logger has any file handlers attached, a second ERROR line points the
    user at those log files. The full traceback is rendered by rich (no color,
    locals hidden) and logged at DEBUG level.

    Call this from inside the ``except`` block that caught ``e``; the traceback
    is produced by ``Console.print_exception()``, which renders the exception
    that is currently being handled.

    :param e: The exception that was caught
    :raises SystemExit: Always, with exit code 1
    """
    import sys

    from rich.console import Console

    logger.error(f"{type(e).__name__}: {e}")

    # Only point at log files when at least one exists; otherwise the message
    # would read "See  for details".
    log_files = [handler.baseFilename for handler in logger.handlers if isinstance(handler, logging.FileHandler)]
    if log_files:
        logger.error(f"See {', '.join(log_files)} for details")

    # Render the traceback into a string (color disabled) so it can be routed
    # through the logger instead of being printed directly.
    console = Console(color_system=None)
    with console.capture() as capture:
        console.print_exception(show_locals=False)
    logger.debug(capture.get())

    sys.exit(1)
|
sonusai/audiofe.py
CHANGED
@@ -1,17 +1,23 @@
|
|
1
1
|
"""sonusai audiofe
|
2
2
|
|
3
|
-
usage: audiofe [-
|
3
|
+
usage: audiofe [-hvdsp] [--version] [-i INPUT] [-l LENGTH] [-a ASR] [-n NOISEDB]
|
4
|
+
[-w WMODEL] [-o FEATURE] MODEL
|
4
5
|
|
5
6
|
options:
|
6
7
|
-h, --help
|
7
8
|
-v, --verbose Be verbose.
|
8
9
|
-d, --debug Write debug data to H5 file.
|
9
10
|
-s, --show Display a list of available audio inputs.
|
10
|
-
-i INPUT, --input INPUT
|
11
|
+
-i INPUT, --input INPUT Audio source from ALSA or .wav file. See -s or arecord -L. [default: default]
|
11
12
|
-l LENGTH, --length LENGTH Length of audio in seconds. [default: -1].
|
12
|
-
-m MODEL, --model MODEL ONNX model.
|
13
|
+
-m MODEL, --model MODEL SonusAI ONNX model applied to the captured audio.
|
14
|
+
-n NOISEDB, --noiseadd NOISEDB Amount of noise to keep in clean audio output. [default: -30]
|
15
|
+
-p, --playback Enable playback of noisy audio, then the model prediction output audio
|
13
16
|
-a ASR, --asr ASR ASR method to use.
|
14
17
|
-w WMODEL, --whisper WMODEL Model used in whisper, aixplain_whisper and faster_whisper methods. [default: tiny].
|
18
|
+
-o FEATURE, --feature-overlap Run SonusAI model in overlap-streaming mode using FEATURE which is an 8-10 character
|
19
|
+
string specifying a stride-overlap feature of the same type as the model, i.e. a
|
20
|
+
model with default feature of hun00ns1 could use hun00nv80 or hun00nv128, etc.
|
15
21
|
|
16
22
|
Aaware SonusAI Audio Front End.
|
17
23
|
|
@@ -35,68 +41,40 @@ audiofe_<TIMESTAMP>.h5.
|
|
35
41
|
|
36
42
|
"""
|
37
43
|
|
38
|
-
import signal
|
39
|
-
|
40
44
|
import numpy as np
|
41
45
|
|
42
46
|
from sonusai.mixture import AudioT
|
43
47
|
|
44
48
|
|
45
|
-
def signal_handler(_sig, _frame):
|
46
|
-
import sys
|
47
|
-
|
48
|
-
from sonusai import logger
|
49
|
-
|
50
|
-
logger.info("Canceled due to keyboard interrupt")
|
51
|
-
sys.exit(1)
|
52
|
-
|
53
|
-
|
54
|
-
signal.signal(signal.SIGINT, signal_handler)
|
55
|
-
|
56
|
-
|
57
49
|
def main() -> None:
|
58
50
|
from docopt import docopt
|
59
51
|
|
60
|
-
import
|
52
|
+
from sonusai import __version__ as sai_version
|
61
53
|
from sonusai.utils import trim_docstring
|
62
54
|
|
63
|
-
args = docopt(trim_docstring(__doc__), version=
|
55
|
+
args = docopt(trim_docstring(__doc__), version=sai_version, options_first=True)
|
64
56
|
|
65
57
|
verbose = args["--verbose"]
|
66
58
|
length = float(args["--length"])
|
67
59
|
input_name = args["--input"]
|
68
|
-
|
60
|
+
feature_ovr = args["--feature-overlap"]
|
69
61
|
asr_name = args["--asr"]
|
70
62
|
whisper_name = args["--whisper"]
|
71
63
|
debug = args["--debug"]
|
72
64
|
show = args["--show"]
|
65
|
+
playback = args["--playback"]
|
66
|
+
noiseadd = args["--noiseadd"]
|
67
|
+
model_name = args["MODEL"]
|
73
68
|
|
74
|
-
from os.path import exists
|
75
|
-
|
76
|
-
import h5py
|
77
69
|
import pyaudio
|
78
70
|
|
79
71
|
from sonusai import create_file_handler
|
80
72
|
from sonusai import initial_log_messages
|
81
73
|
from sonusai import logger
|
82
74
|
from sonusai import update_console_handler
|
83
|
-
from sonusai.mixture import SAMPLE_RATE
|
84
|
-
from sonusai.mixture import get_audio_from_feature
|
85
|
-
from sonusai.mixture import get_feature_from_audio
|
86
|
-
from sonusai.utils import calc_asr
|
87
75
|
from sonusai.utils import create_timestamp
|
88
76
|
from sonusai.utils import get_input_devices
|
89
77
|
from sonusai.utils import load_ort_session
|
90
|
-
from sonusai.utils import write_audio
|
91
|
-
|
92
|
-
ts = create_timestamp()
|
93
|
-
capture_name = f"audiofe_capture_{ts}"
|
94
|
-
capture_wav = capture_name + ".wav"
|
95
|
-
capture_png = capture_name + ".png"
|
96
|
-
predict_name = f"audiofe_predict_{ts}"
|
97
|
-
predict_wav = predict_name + ".wav"
|
98
|
-
predict_png = predict_name + ".png"
|
99
|
-
h5_name = f"audiofe_{ts}.h5"
|
100
78
|
|
101
79
|
# Setup logging file
|
102
80
|
create_file_handler("audiofe.log")
|
@@ -111,7 +89,91 @@ def main() -> None:
|
|
111
89
|
logger.info(f"{name}")
|
112
90
|
logger.info("")
|
113
91
|
p.terminate()
|
114
|
-
return
|
92
|
+
# return
|
93
|
+
|
94
|
+
ts = create_timestamp()
|
95
|
+
capture_name = f"{ts}-noisy"
|
96
|
+
capture_wav = capture_name + ".wav"
|
97
|
+
capture_png = capture_name + ".png"
|
98
|
+
predict_name = f"{ts}-pred"
|
99
|
+
predict_wav = predict_name + ".wav"
|
100
|
+
predict_png = predict_name + ".png"
|
101
|
+
h5_name = f"{ts}-audiofe.h5"
|
102
|
+
|
103
|
+
if model_name is not None:
|
104
|
+
session, options, model_root, hparams, sess_inputs, sess_outputs = load_ort_session(model_name)
|
105
|
+
if hparams is None:
|
106
|
+
logger.error("Error: ONNX model does not have required SonusAI hyperparameters, cannot proceed.")
|
107
|
+
raise SystemExit(1)
|
108
|
+
feature_mode = hparams["feature"]
|
109
|
+
if feature_ovr is not None:
|
110
|
+
# TBD checks for match and valid feature_ovr
|
111
|
+
stride = int(feature_ovr[7:])
|
112
|
+
sov_type = feature_ovr[6] # v,e,f,t supported, need to calculate stride from tstep
|
113
|
+
if sov_type == "v":
|
114
|
+
feat_step = int(np.ceil(0.5 * stride))
|
115
|
+
elif sov_type == "e":
|
116
|
+
feat_step = int(np.ceil(4 * stride / 5))
|
117
|
+
elif sov_type == "f":
|
118
|
+
feat_step = int(np.ceil(3 * stride / 4))
|
119
|
+
elif sov_type == "t":
|
120
|
+
feat_step = int(np.ceil(2 * stride / 3))
|
121
|
+
else:
|
122
|
+
logger.error("Override feature does not have a supported overlap mode, exiting.")
|
123
|
+
raise SystemExit(1)
|
124
|
+
feature_orig = feature_mode
|
125
|
+
feature_mode = feature_ovr
|
126
|
+
logger.info(
|
127
|
+
f"Overriding feature with {feature_ovr} (was {feature_orig}), with stride={stride}, step={feat_step}."
|
128
|
+
)
|
129
|
+
else:
|
130
|
+
feat_step = 1
|
131
|
+
|
132
|
+
from pyaaware import FeatureGenerator
|
133
|
+
|
134
|
+
fg = FeatureGenerator(feature_mode=feature_mode)
|
135
|
+
ftn = fg.ftransform_length # feature transform length
|
136
|
+
ftr = fg.ftransform_overlap # forward transform samples per step (R)
|
137
|
+
fstride = fg.stride # feature stride
|
138
|
+
fsamples = fstride * ftr # total samples in feature
|
139
|
+
|
140
|
+
in0name = sess_inputs[0].name
|
141
|
+
in0type = sess_inputs[0].type
|
142
|
+
out_names = [n.name for n in session.get_outputs()]
|
143
|
+
if len(sess_inputs) != 1:
|
144
|
+
logger.error(f"Error: ONNX model does not have 1 input, but {len(sess_inputs)}. Exit due to unknown input.")
|
145
|
+
raise SystemExit(1)
|
146
|
+
if verbose:
|
147
|
+
logger.info(f"Read and compiled ONNX model from {model_name}.")
|
148
|
+
import onnx
|
149
|
+
|
150
|
+
omodel = onnx.load(model_name)
|
151
|
+
from sonusai.utils.onnx_utils import get_and_check_inputs
|
152
|
+
from sonusai.utils.onnx_utils import get_and_check_outputs
|
153
|
+
|
154
|
+
logger.info(f"Onnx model uses ir_version {omodel.ir_version}")
|
155
|
+
onnx_inputs, inshapes = get_and_check_inputs(omodel) # Note: logs warning if # inputs > 1
|
156
|
+
logger.info(f"Onnx model input has {len(inshapes[0])} dims with shape (0 means dynamic): {inshapes[0]}")
|
157
|
+
logger.info(f"Onnx model input has type: {in0type}")
|
158
|
+
onnx_outputs, oshapes = get_and_check_outputs(omodel)
|
159
|
+
logger.info(f"Onnx model output has {len(oshapes[0])} dims with shape (0 means dynamic): {oshapes[0]}")
|
160
|
+
import onnxruntime as ort
|
161
|
+
|
162
|
+
providers = ort.get_available_providers()
|
163
|
+
logger.info(f"ONNX runtime available providers: {providers}.")
|
164
|
+
else:
|
165
|
+
logger.error("No ONNX model provided, exiting.")
|
166
|
+
raise SystemExit(1)
|
167
|
+
|
168
|
+
from os.path import exists
|
169
|
+
|
170
|
+
import h5py
|
171
|
+
|
172
|
+
from sonusai.constants import SAMPLE_RATE
|
173
|
+
from sonusai.mixture import get_audio_from_feature
|
174
|
+
from sonusai.mixture import get_feature_from_audio
|
175
|
+
from sonusai.utils import calc_asr
|
176
|
+
from sonusai.utils import write_audio
|
115
177
|
|
116
178
|
if input_name is not None and exists(input_name):
|
117
179
|
capture_audio = get_frames_from_file(input_name, length)
|
@@ -123,8 +185,12 @@ def main() -> None:
|
|
123
185
|
return
|
124
186
|
# Only write if capture from device, not for file input
|
125
187
|
write_audio(capture_wav, capture_audio, SAMPLE_RATE)
|
126
|
-
logger.
|
127
|
-
logger.
|
188
|
+
logger.debug("")
|
189
|
+
logger.debug(f"Wrote capture audio with shape {capture_audio.shape} to {capture_wav}")
|
190
|
+
|
191
|
+
# Pad audio to transform step size
|
192
|
+
padlen_tf = int(np.ceil(len(capture_audio) / ftr)) * ftr - len(capture_audio)
|
193
|
+
capture_audio = np.pad(capture_audio, (0, padlen_tf), "constant", constant_values=(0, 0))
|
128
194
|
|
129
195
|
if debug:
|
130
196
|
with h5py.File(h5_name, "a") as f:
|
@@ -135,24 +201,16 @@ def main() -> None:
|
|
135
201
|
|
136
202
|
if asr_name is not None:
|
137
203
|
logger.info(f"Running ASR on captured audio with {asr_name} ...")
|
138
|
-
capture_asr = calc_asr(capture_audio, engine=asr_name,
|
139
|
-
logger.info(f"
|
204
|
+
capture_asr = calc_asr(capture_audio, engine=asr_name, model=whisper_name).text
|
205
|
+
logger.info(f"Noisy audio ASR: {capture_asr}")
|
140
206
|
|
141
207
|
if model_name is not None:
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
raise SystemExit(1)
|
146
|
-
feature_mode = hparams["feature"]
|
147
|
-
in0name = sess_inputs[0].name
|
148
|
-
in0type = sess_inputs[0].type
|
149
|
-
out_names = [n.name for n in session.get_outputs()]
|
150
|
-
|
151
|
-
# frames x stride x feat_params
|
152
|
-
feature = get_feature_from_audio(audio=capture_audio, feature_mode=feature_mode)
|
153
|
-
save_figure(capture_png, capture_audio, feature)
|
154
|
-
logger.info(f"Wrote capture plots to {capture_png}")
|
208
|
+
# Pad audio to fill total feature stride * transform stride samples
|
209
|
+
padlen = int(np.ceil(len(capture_audio) / fsamples)) * fsamples - len(capture_audio)
|
210
|
+
capture_audio_p = np.pad(capture_audio, (0, padlen), "constant", constant_values=(0, 0))
|
155
211
|
|
212
|
+
# feature always frames x stride x feat_params, convert to always Batch x Tsteps x Bins
|
213
|
+
feature = get_feature_from_audio(audio=capture_audio_p, feature_mode=feature_mode)
|
156
214
|
if debug:
|
157
215
|
with h5py.File(h5_name, "a") as f:
|
158
216
|
if "feature" in f:
|
@@ -160,25 +218,61 @@ def main() -> None:
|
|
160
218
|
f.create_dataset("feature", data=feature)
|
161
219
|
logger.info(f"Wrote feature with shape {feature.shape} to {h5_name}")
|
162
220
|
|
221
|
+
feat_nov = sov2nov(feature, feat_step) # remove overlap, output always Batch x Tsteps x Bins
|
222
|
+
# TBD remove padding of feature-stride
|
223
|
+
# if padlen > 0:
|
224
|
+
save_figure(capture_png, capture_audio, feat_nov)
|
225
|
+
logger.info(f"Wrote capture plots to {capture_png}")
|
226
|
+
|
227
|
+
if feature_ovr is not None:
|
228
|
+
test_audio = get_audio_from_feature(feature=feat_nov, feature_mode=feature_orig)
|
229
|
+
# write_audio(f'{ts}-noisy-itf.wav', test_audio, SAMPLE_RATE)
|
230
|
+
else:
|
231
|
+
# feature is frames x 1 x Bins, reshape to 1 x frames x Bins for model
|
232
|
+
feature = feature.transpose((1, 0, 2))
|
233
|
+
|
163
234
|
if in0type.find("float16") != -1:
|
164
235
|
logger.info("Detected input of float16, converting all feature inputs to that type.")
|
165
|
-
feature = np.float16(feature) # type: ignore
|
236
|
+
feature = np.float16(feature) # type: ignore
|
166
237
|
|
167
238
|
# Run inference, ort session wants batch x timesteps x feat_params, outputs numpy BxTxFP or BxFP
|
168
239
|
# Note full reshape not needed here since we assume speech enhancement type model, so a transpose suffices
|
169
|
-
|
170
|
-
|
171
|
-
(
|
172
|
-
|
240
|
+
logger.info(f"Running model on data with shape {feature.shape} ...")
|
241
|
+
if feature_ovr is None:
|
242
|
+
predict = session.run(out_names, {in0name: feature})[0] # standard mode (entire batch)
|
243
|
+
else:
|
244
|
+
predict = np.zeros(feature.shape)
|
245
|
+
for i in range(predict.shape[0]):
|
246
|
+
logger.debug(f"running batch: {i}")
|
247
|
+
predict[i, :, :] = session.run(out_names, {in0name: feature[i : i + 1, :, :]})[0]
|
173
248
|
|
174
249
|
if debug:
|
175
250
|
with h5py.File(h5_name, "a") as f:
|
176
251
|
if "predict" in f:
|
177
252
|
del f["predict"]
|
178
253
|
f.create_dataset("predict", data=predict)
|
179
|
-
logger.info(f"Wrote predict with shape {predict.shape} to {h5_name}")
|
254
|
+
logger.info(f"Wrote predict data with shape {predict.shape} to {h5_name}")
|
255
|
+
|
256
|
+
if feature_ovr is not None:
|
257
|
+
predict = sov2nov(predict, feat_step) # always returns batch x tsteps x feat_params
|
258
|
+
predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_orig)
|
259
|
+
else:
|
260
|
+
predict = predict.transpose((1, 0, 2)) # need transpose to frames x 1 x bins
|
261
|
+
predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
|
262
|
+
|
263
|
+
if predict_audio.shape[0] > capture_audio.shape[0]:
|
264
|
+
predict_audio = predict_audio[0 : (capture_audio.shape[0] - predict_audio.shape[0])]
|
265
|
+
|
266
|
+
if predict_audio.shape[0] < capture_audio.shape[0]:
|
267
|
+
capture_audio = capture_audio[0 : (predict_audio.shape[0] - capture_audio.shape[0])]
|
268
|
+
|
269
|
+
if noiseadd is not None:
|
270
|
+
ngain = np.power(10, min(float(noiseadd), 0.0) / 20.0) # limit to gain <1, convert to float
|
271
|
+
if ngain < 1.0: # don't apply if it's 1.0
|
272
|
+
logger.info(f"Adding back noise with gain of {ngain} = {noiseadd} db.")
|
273
|
+
noise = capture_audio - predict_audio
|
274
|
+
predict_audio = predict_audio + ngain * noise
|
180
275
|
|
181
|
-
predict_audio = get_audio_from_feature(feature=predict, feature_mode=feature_mode)
|
182
276
|
write_audio(predict_wav, predict_audio, SAMPLE_RATE)
|
183
277
|
logger.info(f"Wrote predict audio with shape {predict_audio.shape} to {predict_wav}")
|
184
278
|
if debug:
|
@@ -193,9 +287,35 @@ def main() -> None:
|
|
193
287
|
|
194
288
|
if asr_name is not None:
|
195
289
|
logger.info(f"Running ASR on model-enhanced audio with {asr_name} ...")
|
196
|
-
predict_asr = calc_asr(predict_audio, engine=asr_name,
|
290
|
+
predict_asr = calc_asr(predict_audio, engine=asr_name, model=whisper_name).text
|
197
291
|
logger.info(f"Predict audio ASR: {predict_asr}")
|
198
292
|
|
293
|
+
plot_en = True
|
294
|
+
if plot_en is not None:
|
295
|
+
import subprocess
|
296
|
+
|
297
|
+
# Construct plot command using spgramd, start the process non-blocking (will leave matplot open)
|
298
|
+
command = ["python", "spgramd.py", capture_wav, predict_wav]
|
299
|
+
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
300
|
+
|
301
|
+
if playback is not None:
|
302
|
+
import sh
|
303
|
+
|
304
|
+
sh.play(capture_wav)
|
305
|
+
sh.play(predict_wav)
|
306
|
+
flag_end = False
|
307
|
+
while not flag_end:
|
308
|
+
choice = input("Press 'r' to replay or 'q' to quit: ").strip().lower()
|
309
|
+
if choice == "q":
|
310
|
+
print("Quitting...")
|
311
|
+
flag_end = True
|
312
|
+
elif choice == "r":
|
313
|
+
print("Replaying...")
|
314
|
+
sh.play(capture_wav)
|
315
|
+
sh.play(predict_wav)
|
316
|
+
else:
|
317
|
+
print("Invalid input. Please try again.")
|
318
|
+
|
199
319
|
|
200
320
|
def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1024) -> AudioT:
|
201
321
|
from select import select
|
@@ -204,8 +324,8 @@ def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1
|
|
204
324
|
import pyaudio
|
205
325
|
|
206
326
|
from sonusai import logger
|
207
|
-
from sonusai.
|
208
|
-
from sonusai.
|
327
|
+
from sonusai.constants import CHANNEL_COUNT
|
328
|
+
from sonusai.constants import SAMPLE_RATE
|
209
329
|
from sonusai.utils import get_input_device_index_by_name
|
210
330
|
from sonusai.utils import get_input_devices
|
211
331
|
|
@@ -220,20 +340,16 @@ def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1
|
|
220
340
|
|
221
341
|
try:
|
222
342
|
device_index = get_input_device_index_by_name(p, input_name)
|
223
|
-
except ValueError as
|
343
|
+
except ValueError as ex:
|
224
344
|
msg = f"Could not find {input_name}\n"
|
225
345
|
msg += "Available devices:\n"
|
226
346
|
for input_device in input_devices:
|
227
347
|
msg += f" {input_device}\n"
|
228
|
-
raise ValueError(msg) from
|
348
|
+
raise ValueError(msg) from ex
|
229
349
|
|
230
350
|
logger.info(f"Capturing from {p.get_device_info_by_index(device_index).get('name')}")
|
231
351
|
stream = p.open(
|
232
|
-
format=pyaudio.paFloat32,
|
233
|
-
channels=CHANNEL_COUNT,
|
234
|
-
rate=SAMPLE_RATE,
|
235
|
-
input=True,
|
236
|
-
input_device_index=device_index,
|
352
|
+
format=pyaudio.paFloat32, channels=CHANNEL_COUNT, rate=SAMPLE_RATE, input=True, input_device_index=device_index
|
237
353
|
)
|
238
354
|
stream.start_stream()
|
239
355
|
|
@@ -269,7 +385,7 @@ def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1
|
|
269
385
|
|
270
386
|
def get_frames_from_file(input_name: str, length: float) -> AudioT:
|
271
387
|
from sonusai import logger
|
272
|
-
from sonusai.
|
388
|
+
from sonusai.constants import SAMPLE_RATE
|
273
389
|
from sonusai.mixture import read_audio
|
274
390
|
|
275
391
|
logger.info(f"Capturing from {input_name}")
|
@@ -281,14 +397,30 @@ def get_frames_from_file(input_name: str, length: float) -> AudioT:
|
|
281
397
|
return frames
|
282
398
|
|
283
399
|
|
400
|
+
def sov2nov(feature: np.ndarray, step: int) -> np.ndarray:
    """Convert stride-overlap features to non-overlapping single-step frames.

    For every entry along the first axis, only the last ``step`` entries of the
    second axis are kept; the kept rows are then stacked in order along the
    first axis.

    :param feature: Stride-overlap feature data, batch x stride x bins
    :param step: Number of trailing entries along the stride axis to keep per batch
    :return: Array of shape (batch * step, 1, bins). When the stride axis already
        has length 1 the input is returned unchanged.
    :raises ValueError: If ``step`` is negative or larger than the stride
    """
    stride = feature.shape[1]  # stride, tsteps is set to stride in sov mode
    if stride == 1:
        return feature  # no reshape if stride is already 1

    # An out-of-range step would otherwise surface as an opaque reshape error
    # (or silently yield an empty result), so reject it explicitly.
    if not 0 <= step <= stride:
        raise ValueError(f"step must be in the range [0, {stride}], got {step}")

    nb = feature.shape[0]
    fout = feature[:, (stride - step) :, :]  # keep only the last `step` entries of each batch
    return np.reshape(fout, [step * nb, 1, feature.shape[2]])
|
414
|
+
|
415
|
+
|
284
416
|
def save_figure(name: str, audio: np.ndarray, feature: np.ndarray) -> None:
|
285
417
|
import matplotlib.pyplot as plt
|
286
418
|
from scipy.interpolate import CubicSpline
|
287
419
|
|
288
|
-
from sonusai.
|
420
|
+
from sonusai.constants import SAMPLE_RATE
|
289
421
|
from sonusai.utils import unstack_complex
|
290
422
|
|
291
|
-
spectrum = 20 * np.log(np.abs(np.squeeze(unstack_complex(feature)).transpose()))
|
423
|
+
spectrum = 20 * np.log(np.abs(np.squeeze(unstack_complex(feature)).transpose()) + 1e-7)
|
292
424
|
frames = spectrum.shape[1]
|
293
425
|
samples = (len(audio) // frames) * frames
|
294
426
|
length_in_s = samples / SAMPLE_RATE
|
@@ -314,4 +446,36 @@ def save_figure(name: str, audio: np.ndarray, feature: np.ndarray) -> None:
|
|
314
446
|
|
315
447
|
|
316
448
|
if __name__ == "__main__":
    # Script entry point: run main() and route any uncaught Exception through
    # sonusai.exception_handler, which logs it and exits with status 1.
    from sonusai import exception_handler
    from sonusai.utils import register_keyboard_interrupt

    # NOTE(review): presumably installs a Ctrl-C (SIGINT) handler, replacing the
    # module-level signal handler this file used previously - confirm in sonusai.utils.
    register_keyboard_interrupt()
    try:
        main()
    except Exception as e:
        exception_handler(e)
|