sonusai 0.18.5__py3-none-any.whl → 0.18.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +6 -0
- sonusai/genmetrics.py +4 -4
- sonusai/metrics/__init__.py +2 -1
- sonusai/metrics/calc_audio_stats.py +9 -1
- sonusai/metrics/calc_segsnr_f.py +84 -0
- sonusai/metrics/calc_speech.py +5 -5
- sonusai/mixture/__init__.py +3 -0
- sonusai/mixture/datatypes.py +65 -6
- sonusai/mixture/feature.py +4 -19
- sonusai/mixture/helpers.py +47 -38
- sonusai/mixture/mixdb.py +147 -57
- sonusai/mixture/sox_audio.py +125 -0
- sonusai/mixture/truth_functions/data.py +23 -22
- sonusai/mixture/truth_functions/energy.py +3 -1
- sonusai/mixture/truth_functions/sed.py +2 -1
- sonusai/mixture/truth_functions/target.py +3 -4
- sonusai/post_spenh_targetf.py +7 -7
- sonusai/utils/__init__.py +2 -0
- sonusai/utils/compress.py +25 -0
- sonusai/utils/energy_f.py +3 -4
- {sonusai-0.18.5.dist-info → sonusai-0.18.7.dist-info}/METADATA +1 -1
- {sonusai-0.18.5.dist-info → sonusai-0.18.7.dist-info}/RECORD +24 -23
- sonusai/metrics/calc_snr_f.py +0 -34
- {sonusai-0.18.5.dist-info → sonusai-0.18.7.dist-info}/WHEEL +0 -0
- {sonusai-0.18.5.dist-info → sonusai-0.18.7.dist-info}/entry_points.txt +0 -0
sonusai/mixture/mixdb.py
CHANGED
@@ -17,6 +17,8 @@ from sonusai.mixture.datatypes import FeatureGeneratorConfig
 from sonusai.mixture.datatypes import FeatureGeneratorInfo
 from sonusai.mixture.datatypes import GeneralizedIDs
 from sonusai.mixture.datatypes import ImpulseResponseFiles
+from sonusai.mixture.datatypes import MetricDoc
+from sonusai.mixture.datatypes import MetricDocs
 from sonusai.mixture.datatypes import Mixture
 from sonusai.mixture.datatypes import Mixtures
 from sonusai.mixture.datatypes import NoiseFile
@@ -155,19 +157,65 @@ class MixtureDatabase:
         return json.loads(c.execute("SELECT top.asr_configs FROM top").fetchone()[0])

     @cached_property
-    def supported_metrics(self) ->
-        metrics =
+    def supported_metrics(self) -> MetricDocs:
+        metrics = MetricDocs([
+            MetricDoc('Mixture Metrics', 'mxsnr', 'SNR specification in dB'),
+            MetricDoc('Mixture Metrics', 'mxssnr_avg', 'Segmental SNR average over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnr_std', 'Segmental SNR standard deviation over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnrdb_avg',
+                      'Segmental SNR average of the dB frame values over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnrdb_std',
+                      'Segmental SNR standard deviation of the dB frame values over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnrf_avg',
+                      'Per-bin segmental SNR average over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxssnrf_std',
+                      'Per-bin segmental SNR standard deviation over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxssnrdbf_avg',
+                      'Per-bin segmental average of the dB frame values over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxssnrdbf_std',
+                      'Per-bin segmental standard deviation of the dB frame values over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxpesq', 'PESQ of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxwsdr', 'Weighted signal distortion ratio of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxpd', 'Phase distance between mixture and true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxstoi',
+                      'Short term objective intelligibility of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxcsig',
+                      'Predicted rating of speech distortion of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxcbak',
+                      'Predicted rating of background distortion of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxcovl',
+                      'Predicted rating of overall quality of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'ssnr', 'Segmental SNR'),
+            MetricDoc('Target Metrics', 'tdco', 'Target[0] DC offset'),
+            MetricDoc('Target Metrics', 'tmin', 'Target[0] min level'),
+            MetricDoc('Target Metrics', 'tmax', 'Target[0] max level'),
+            MetricDoc('Target Metrics', 'tpkdb', 'Target[0] Pk lev dB'),
+            MetricDoc('Target Metrics', 'tlrms', 'Target[0] RMS lev dB'),
+            MetricDoc('Target Metrics', 'tpkr', 'Target[0] RMS Pk dB'),
+            MetricDoc('Target Metrics', 'ttr', 'Target[0] RMS Tr dB'),
+            MetricDoc('Target Metrics', 'tcr', 'Target[0] Crest factor'),
+            MetricDoc('Target Metrics', 'tfl', 'Target[0] Flat factor'),
+            MetricDoc('Target Metrics', 'tpkc', 'Target[0] Pk count'),
+            MetricDoc('Noise Metrics', 'ndco', 'Noise DC offset'),
+            MetricDoc('Noise Metrics', 'nmin', 'Noise min level'),
+            MetricDoc('Noise Metrics', 'nmax', 'Noise max level'),
+            MetricDoc('Noise Metrics', 'npkdb', 'Noise Pk lev dB'),
+            MetricDoc('Noise Metrics', 'nlrms', 'Noise RMS lev dB'),
+            MetricDoc('Noise Metrics', 'npkr', 'Noise RMS Pk dB'),
+            MetricDoc('Noise Metrics', 'ntr', 'Noise RMS Tr dB'),
+            MetricDoc('Noise Metrics', 'ncr', 'Noise Crest factor'),
+            MetricDoc('Noise Metrics', 'nfl', 'Noise Flat factor'),
+            MetricDoc('Noise Metrics', 'npkc', 'Noise Pk count'),
+            MetricDoc('Truth Metrics', 'sedavg',
+                      '(not implemented) Average SED activity over all frames [num_classes, 1]'),
+            MetricDoc('Truth Metrics', 'sedcnt',
+                      '(not implemented) Count in number of frames that SED is active [num_classes, 1]'),
+            MetricDoc('Truth Metrics', 'sedtop3', '(not implemented) 3 most active by largest sedavg [3, 1]'),
+            MetricDoc('Truth Metrics', 'sedtopn', '(not implemented) N most active by largest sedavg [N, 1]'),
+        ])
         for name in self.asr_configs:
-            metrics.
+            metrics.append(MetricDoc('Mixture Metrics', f'mxwer.{name}',
+                                     f'Word error rate using {name} ASR as defined in mixdb asr_configs parameter'))

         return metrics
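The metric names above double as the keys accepted by the per-mixture metric dispatch further down in this diff. A small sketch of browsing the table; the `category` and `name` attribute names on MetricDoc, and the MixtureDatabase constructor argument, are assumptions for illustration:

# Sketch: group the advertised metric names by category (attribute names assumed).
from sonusai.mixture import MixtureDatabase

mixdb = MixtureDatabase('path/to/mixdb')  # hypothetical database location
by_category: dict[str, list[str]] = {}
for doc in mixdb.supported_metrics:  # cached_property, so no call parentheses
    by_category.setdefault(doc.category, []).append(doc.name)

for category, names in sorted(by_category.items()):
    print(f"{category}: {', '.join(sorted(names))}")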
@@ -240,11 +288,15 @@ class MixtureDatabase:
     def total_feature_frames(self, m_ids: GeneralizedIDs = '*') -> int:
         return self.total_samples(m_ids) // self.feature_step_samples

-    def mixture_transform_frames(self,
+    def mixture_transform_frames(self, m_id: int) -> int:
+        from .helpers import frames_from_samples

+        return frames_from_samples(self.mixture(m_id).samples, self.ft_config.R)
+
+    def mixture_feature_frames(self, m_id: int) -> int:
+        from .helpers import frames_from_samples
+
+        return frames_from_samples(self.mixture(m_id).samples, self.feature_step_samples)

     def mixids_to_list(self, m_ids: Optional[GeneralizedIDs] = None) -> list[int]:
         """Resolve generalized mixture IDs to a list of integers
@@ -907,8 +959,8 @@ class MixtureDatabase:
         truth_t = self.mixture_truth_t(m_id=m_id, targets=targets, noise=noise, force=force)

         m = self.mixture(m_id)
-        transform_frames = self.mixture_transform_frames(
-        feature_frames = self.mixture_feature_frames(
+        transform_frames = self.mixture_transform_frames(m_id)
+        feature_frames = self.mixture_feature_frames(m_id)

         if truth_t is None:
             truth_t = np.zeros((m.samples, self.num_classes), dtype=np.float32)
@@ -1063,25 +1115,22 @@ class MixtureDatabase:
         if is_textgrid:
             for target in self.mixture(mixid).targets:
                 data = get_textgrid_tier_from_target_file(self.target_file(target.file_id).name, tier)
-                if data
-                    results.append(data)
+                if isinstance(data, list):
+                    # Check for tempo augmentation and adjust Interval start and end data as needed
+                    entries = []
+                    for entry in data:
+                        if target.augmentation.tempo is not None:
+                            entries.append(Interval(entry.start / target.augmentation.tempo,
+                                                    entry.end / target.augmentation.tempo,
+                                                    entry.label))
+                        else:
+                            entries.append(entry)
+                    results.append(entries)
+                else:
+                    results.append(data)
         else:
             for target in self.mixture(mixid).targets:
-                if data is not None:
-                    results.append(data)
+                results.append(self.speaker(self.target_file(target.file_id).speaker_id, tier))

         return sorted(results)
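The tempo branch above maps TextGrid interval times from the original recording onto the tempo-augmented timeline: a tempo factor of 2.0 makes the audio play twice as fast, so an interval spanning 1.0-2.0 s lands at 0.5-1.0 s. A self-contained sketch of that rescaling, using a stand-in Interval with the (start, end, label) shape the call above implies:

from typing import NamedTuple, Optional


class Interval(NamedTuple):  # stand-in for the sonusai Interval type
    start: float
    end: float
    label: str


def rescale(entries: list[Interval], tempo: Optional[float]) -> list[Interval]:
    # Dividing by the tempo factor maps original times onto the augmented timeline.
    if tempo is None:
        return entries
    return [Interval(e.start / tempo, e.end / tempo, e.label) for e in entries]


assert rescale([Interval(1.0, 2.0, 'word')], 2.0) == [Interval(0.5, 1.0, 'word')]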
@@ -1152,7 +1201,8 @@ class MixtureDatabase:
         from sonusai import SonusAIError
         from sonusai.metrics import calc_audio_stats
         from sonusai.metrics import calc_phase_distance
-        from sonusai.metrics import
+        from sonusai.metrics import calc_segsnr_f
+        from sonusai.metrics import calc_segsnr_f_bin
         from sonusai.metrics import calc_speech
         from sonusai.metrics import calc_wer
         from sonusai.metrics import calc_wsdr
@@ -1161,7 +1211,7 @@ class MixtureDatabase:
         from sonusai.mixture import SpeechMetrics
         from sonusai.utils import calc_asr

-        def create_target_audio() -> Callable:
+        def create_target_audio() -> Callable[[], np.ndarray]:
            state = None

            def get() -> np.ndarray:
@@ -1174,7 +1224,20 @@ class MixtureDatabase:

         target_audio = create_target_audio()

-        def
+        def create_target_f() -> Callable[[], np.ndarray]:
+            state = None
+
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_targets_f(m_id)[0]
+                return state
+
+            return get
+
+        target_f = create_target_f()
+
+        def create_noise_audio() -> Callable[[], np.ndarray]:
             state = None

             def get() -> np.ndarray:
@@ -1187,7 +1250,20 @@ class MixtureDatabase:

         noise_audio = create_noise_audio()

-        def
+        def create_noise_f() -> Callable[[], np.ndarray]:
+            state = None
+
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_noise_f(m_id)
+                return state
+
+            return get
+
+        noise_f = create_noise_f()
+
+        def create_mixture_audio() -> Callable[[], np.ndarray]:
             state = None

             def get() -> np.ndarray:
@@ -1200,7 +1276,7 @@ class MixtureDatabase:

         mixture_audio = create_mixture_audio()

-        def create_segsnr_f() -> Callable:
+        def create_segsnr_f() -> Callable[[], np.ndarray]:
             state = None

             def get() -> np.ndarray:
@@ -1213,7 +1289,7 @@ class MixtureDatabase:

         segsnr_f = create_segsnr_f()

-        def create_speech() -> Callable:
+        def create_speech() -> Callable[[], SpeechMetrics]:
             state = None

             def get() -> SpeechMetrics:
@@ -1226,7 +1302,7 @@ class MixtureDatabase:

         speech = create_speech()

-        def create_target_stats() -> Callable:
+        def create_target_stats() -> Callable[[], AudioStatsMetrics]:
             state = None

             def get() -> AudioStatsMetrics:
@@ -1239,7 +1315,7 @@ class MixtureDatabase:

         target_stats = create_target_stats()

-        def create_noise_stats() -> Callable:
+        def create_noise_stats() -> Callable[[], AudioStatsMetrics]:
             state = None

             def get() -> AudioStatsMetrics:
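Every `create_*` helper in this method is the same lazy-memoization idiom: the returned closure computes an expensive artifact (decoded audio, a transform, speech or stats metrics) on first call and caches it in `nonlocal state`, so computing one metric never pays for inputs it does not touch. A distilled sketch of the idiom:

from typing import Callable, Optional, TypeVar

T = TypeVar('T')


def lazy(compute: Callable[[], T]) -> Callable[[], T]:
    state: Optional[T] = None

    def get() -> T:
        nonlocal state
        if state is None:
            state = compute()  # computed once, on first request
        return state

    return get


# Usage: the expensive computation runs only if some metric actually calls value().
value = lazy(lambda: sum(range(10_000_000)))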
@@ -1289,17 +1365,29 @@ class MixtureDatabase:
                # TODO: should this be NaN like above?
                return float(0)

-            if m == '
-                return
+            if m == 'mxssnr_avg':
+                return calc_segsnr_f(segsnr_f()).avg
+
+            if m == 'mxssnr_std':
+                return calc_segsnr_f(segsnr_f()).std

-            if m == '
-                return
+            if m == 'mxssnrdb_avg':
+                return calc_segsnr_f(segsnr_f()).db_avg

-            if m == '
-                return
+            if m == 'mxssnrdb_std':
+                return calc_segsnr_f(segsnr_f()).db_std

-            if m == '
-                return
+            if m == 'mxssnrf_avg':
+                return calc_segsnr_f_bin(target_f(), noise_f()).avg
+
+            if m == 'mxssnrf_std':
+                return calc_segsnr_f_bin(target_f(), noise_f()).std
+
+            if m == 'mxssnrdbf_avg':
+                return calc_segsnr_f_bin(target_f(), noise_f()).db_avg
+
+            if m == 'mxssnrdbf_std':
+                return calc_segsnr_f_bin(target_f(), noise_f()).db_std

            if m == 'mxpesq':
                if self.mixture(m_id).snr < -96:
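These branches reduce per-frame segmental SNR to four summary statistics. The actual reduction lives in the new sonusai/metrics/calc_segsnr_f.py, which this page does not expand; a plausible sketch, assuming `segsnr` holds per-frame linear SNR values and that the result exposes the `.avg/.std/.db_avg/.db_std` fields read above:

from dataclasses import dataclass

import numpy as np


@dataclass
class SegSNRStats:  # stand-in for the calc_segsnr_f result type
    avg: float
    std: float
    db_avg: float
    db_std: float


def segsnr_stats(segsnr: np.ndarray, eps: float = 1e-10) -> SegSNRStats:
    # Statistics over the linear per-frame SNR, then over the dB frame values.
    segsnr_db = 10 * np.log10(segsnr + eps)
    return SegSNRStats(avg=float(np.mean(segsnr)),
                       std=float(np.std(segsnr)),
                       db_avg=float(np.mean(segsnr_db)),
                       db_std=float(np.std(segsnr_db)))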
@@ -1309,17 +1397,17 @@ class MixtureDatabase:
            if m == 'mxcsig':
                if self.mixture(m_id).snr < -96:
                    return 0
-                return speech().
+                return speech().csig

            if m == 'mxcbak':
                if self.mixture(m_id).snr < -96:
                    return 0
-                return speech().
+                return speech().cbak

            if m == 'mxcovl':
                if self.mixture(m_id).snr < -96:
                    return 0
-                return speech().
+                return speech().covl

            if m == 'mxwsdr':
                mixture = mixture_audio()[:, np.newaxis]
@@ -1331,8 +1419,7 @@ class MixtureDatabase:

            if m == 'mxpd':
                mixture_f = self.mixture_mixture_f(m_id)
-
-                return calc_phase_distance(hypothesis=mixture_f, reference=target_f)[0]
+                return calc_phase_distance(hypothesis=mixture_f, reference=target_f())[0]

            if m == 'mxstoi':
                return stoi(x=target_audio(), y=mixture_audio(), fs_sig=SAMPLE_RATE, extended=False)
@@ -1403,11 +1490,14 @@ class MixtureDatabase:
            if m == 'sedcnt':
                return 0

+            if m == 'sedtop3':
+                return np.zeros(3, dtype=np.float32)
+
            if m == 'sedtopn':
                return 0

            if m == 'ssnr':
-                return
+                return segsnr_f()

            raise SonusAIError(f"Unrecognized metric: '{m}'")
sonusai/mixture/sox_audio.py
CHANGED
@@ -210,6 +210,131 @@ class Transformer(SoxTransformer):

         return self

+    def build(self,
+              input_filepath: Optional[str | Path] = None,
+              output_filepath: Optional[str | Path] = None,
+              input_array: Optional[np.ndarray] = None,
+              sample_rate_in: Optional[float] = None,
+              extra_args: Optional[list[str]] = None,
+              return_output: bool = False) -> tuple[bool, Optional[str], Optional[str]]:
+        """Given an input file or array, creates an output_file on disk by
+        executing the current set of commands. This function returns True on
+        success. If return_output is True, this function returns a triple of
+        (status, out, err), giving the success state, along with stdout and
+        stderr returned by sox.
+
+        Parameters
+        ----------
+        input_filepath : str or None
+            Either path to input audio file or None for array input.
+        output_filepath : str
+            Path to desired output file. If a file already exists at
+            the given path, the file will be overwritten.
+            If '-n', no file is created.
+        input_array : np.ndarray or None
+            An np.ndarray of a waveform with shape (n_samples, n_channels).
+            sample_rate_in must also be provided.
+            If None, input_filepath must be specified.
+        sample_rate_in : int
+            Sample rate of input_array.
+            This argument is ignored if input_array is None.
+        extra_args : list or None, default=None
+            If a list is given, these additional arguments are passed to SoX
+            at the end of the list of effects.
+            Don't use this argument unless you know exactly what you're doing!
+        return_output : bool, default=False
+            If True, returns the status and information sent to stderr and
+            stdout as a tuple (status, stdout, stderr).
+            If output_filepath is None, return_output=True by default.
+            If False, returns True on success.
+
+        Returns
+        -------
+        status : bool
+            True on success.
+        out : str (optional)
+            This is not returned unless return_output is True.
+            When returned, captures the stdout produced by sox.
+        err : str (optional)
+            This is not returned unless return_output is True.
+            When returned, captures the stderr produced by sox.
+
+        Examples
+        --------
+        > import numpy as np
+        > import sox
+        > tfm = sox.Transformer()
+        > sample_rate = 44100
+        > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)
+
+        file in, file out - basic usage
+
+        > status = tfm.build('path/to/input.wav', 'path/to/output.mp3')
+
+        file in, file out - equivalent usage
+
+        > status = tfm.build(
+              input_filepath='path/to/input.wav',
+              output_filepath='path/to/output.mp3'
+          )
+
+        array in, file out
+
+        > status = tfm.build(
+              input_array=y, sample_rate_in=sample_rate,
+              output_filepath='path/to/output.mp3'
+          )
+
+        """
+        from sox import file_info
+        from sox.core import SoxError
+        from sox.core import sox
+        from sox.log import logger
+
+        input_format, input_filepath = self._parse_inputs(
+            input_filepath, input_array, sample_rate_in
+        )
+
+        if output_filepath is None:
+            raise ValueError("output_filepath is not specified!")
+
+        # set output parameters
+        if input_filepath == output_filepath:
+            raise ValueError(
+                "input_filepath must be different from output_filepath."
+            )
+        file_info.validate_output_file(output_filepath)
+
+        args = []
+        args.extend(self.globals)
+        args.extend(self._input_format_args(input_format))
+        args.append(input_filepath)
+        args.extend(self._output_format_args(self.output_format))
+        args.append(output_filepath)
+        args.extend(self.effects)
+
+        if extra_args is not None:
+            if not isinstance(extra_args, list):
+                raise ValueError("extra_args must be a list.")
+            args.extend(extra_args)
+
+        status, out, err = sox(args, input_array, True)
+        if status != 0:
+            raise SoxError(
+                f"Stdout: {out}\nStderr: {err}"
+            )
+
+        logger.info(
+            "Created %s with effects: %s",
+            output_filepath,
+            " ".join(self.effects_log)
+        )
+
+        if return_output:
+            return status, out, err
+
+        return True, None, None
+
     def build_array(self,
                     input_filepath: Optional[str | Path] = None,
                     input_array: Optional[np.ndarray] = None,
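An array-in, file-out sketch against the vendored Transformer; the import path follows this diff, and the call mirrors the docstring examples above, with 'output.wav' as a stand-in output path:

import numpy as np

from sonusai.mixture.sox_audio import Transformer

sample_rate = 44100
# One second of a 440 Hz sine as a (n_samples,) waveform.
y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate) / sample_rate)

tfm = Transformer()
status, out, err = tfm.build(input_array=y,
                             sample_rate_in=sample_rate,
                             output_filepath='output.wav',
                             return_output=True)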
sonusai/mixture/truth_functions/data.py
CHANGED
@@ -3,13 +3,14 @@ from sonusai.mixture.datatypes import TruthFunctionConfig


 class Data:
-    def __init__(self,
+    def __init__(self,
+                 target_audio: AudioT,
                  noise_audio: AudioT,
                  mixture_audio: AudioT,
                  config: TruthFunctionConfig) -> None:
         import numpy as np
-        from
-        from
+        from sonusai import ForwardTransform
+        from sonusai import InverseTransform
         from pyaaware import FeatureGenerator

         from sonusai import SonusAIError

@@ -33,25 +34,25 @@ class Data:

         self.offsets = range(0, len(target_audio), self.frame_size)
         self.zero_based_indices = [x - 1 for x in config.index]
-        self.target_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                                 R=fg.ftransform_R,
-                                                 bin_start=fg.bin_start,
-                                                 bin_end=fg.bin_end,
-                                                 ttype=fg.ftransform_ttype)
-        self.noise_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                                R=fg.ftransform_R,
-                                                bin_start=fg.bin_start,
-                                                bin_end=fg.bin_end,
-                                                ttype=fg.ftransform_ttype)
-        self.mixture_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                                  R=fg.ftransform_R,
-                                                  bin_start=fg.bin_start,
-                                                  bin_end=fg.bin_end,
-                                                  ttype=fg.ftransform_ttype)
-        self.swin = AawareInverseTransform(N=fg.itransform_N,
-                                           R=fg.itransform_R,
-                                           bin_start=fg.bin_start,
-                                           bin_end=fg.bin_end,
-                                           ttype=fg.
+        self.target_fft = ForwardTransform(N=fg.ftransform_N,
+                                           R=fg.ftransform_R,
+                                           bin_start=fg.bin_start,
+                                           bin_end=fg.bin_end,
+                                           ttype=fg.ftransform_ttype)
+        self.noise_fft = ForwardTransform(N=fg.ftransform_N,
+                                          R=fg.ftransform_R,
+                                          bin_start=fg.bin_start,
+                                          bin_end=fg.bin_end,
+                                          ttype=fg.ftransform_ttype)
+        self.mixture_fft = ForwardTransform(N=fg.ftransform_N,
+                                            R=fg.ftransform_R,
+                                            bin_start=fg.bin_start,
+                                            bin_end=fg.bin_end,
+                                            ttype=fg.ftransform_ttype)
+        self.swin = InverseTransform(N=fg.itransform_N,
+                                     R=fg.itransform_R,
+                                     bin_start=fg.bin_start,
+                                     bin_end=fg.bin_end,
+                                     ttype=fg.itransform_ttype,
+                                     gain=np.float32(1)).W
         self.truth = np.zeros((len(target_audio), config.num_classes), dtype=np.float32)
sonusai/mixture/truth_functions/energy.py
CHANGED
@@ -132,9 +132,11 @@ def energy_t(data: Data) -> Truth:
     will reflect the total energy over all bins regardless of the feature
     transform config.
     """
+    import torch
+
     from sonusai import SonusAIError

-
+    target_energy = data.target_fft.execute_all(torch.from_numpy(data.target_audio))[1].numpy()
     if len(target_energy) != len(data.offsets):
         raise SonusAIError(f'Number of frames in target_energy, {len(target_energy)},'
                            f' is not number of frames in truth, {len(data.offsets)}')
sonusai/mixture/truth_functions/sed.py
CHANGED
@@ -21,6 +21,7 @@ should be set to the number of sounds/classes to be detected + 1 for
 the other class.
 """
 import numpy as np
+import torch
 from pyaaware import SED

 from sonusai import SonusAIError

@@ -48,7 +49,7 @@ the other class.
                       mutex=data.config.mutex)

     target_audio = data.target_audio / data.config.target_gain
-
+    energy_t = data.target_fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
     if len(energy_t) != len(data.offsets):
         raise SonusAIError(f'Number of frames in energy_t, {len(energy_t)},'
                            f' is not number of frames in truth, {len(data.offsets)}')
sonusai/mixture/truth_functions/target.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from sonusai import ForwardTransform

 from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT

@@ -98,7 +98,6 @@ Output shape: [:, 2 * bins] (stacked real, imag)
     for idx, offset in enumerate(data.offsets):
         target_freq, _ = data.target_fft.execute(
             np.multiply(data.target_audio[offset:offset + data.frame_size], data.swin))
-        target_freq = target_freq.transpose()

         indices = slice(offset, offset + data.frame_size)
         for index in data.zero_based_indices:

@@ -112,10 +111,10 @@ Output shape: [:, 2 * bins] (stacked real, imag)


 def _execute_fft(audio: AudioT, transform: ForwardTransform, expected_frames: int) -> AudioF:
+    import torch
     from sonusai import SonusAIError

-    freq
-    freq = freq.transpose()
+    freq = transform.execute_all(torch.from_numpy(audio))[0].numpy()
     if len(freq) != expected_frames:
         raise SonusAIError(f'Number of frames, {len(freq)}, is not number of frames expected, {expected_frames}')
     return freq
sonusai/post_spenh_targetf.py
CHANGED
@@ -131,7 +131,7 @@ def _process(file: str) -> None:

     import h5py
     import numpy as np
-    from
+    from sonusai import InverseTransform

     from sonusai import SonusAIError
     from sonusai.mixture import get_audio_from_transform

@@ -147,12 +147,12 @@ def _process(file: str) -> None:

     output_name = join(MP_GLOBAL.output_dir, splitext(basename(file))[0] + '.wav')
     audio, _ = get_audio_from_transform(data=predict,
-                                        transform=
+                                        transform=InverseTransform(N=MP_GLOBAL.N,
+                                                                   R=MP_GLOBAL.R,
+                                                                   bin_start=MP_GLOBAL.bin_start,
+                                                                   bin_end=MP_GLOBAL.bin_end,
+                                                                   ttype=MP_GLOBAL.ttype,
+                                                                   gain=np.float32(1)))
     write_audio(name=output_name, audio=float_to_int16(audio))
sonusai/utils/__init__.py
CHANGED
@@ -9,6 +9,8 @@ from .audio_devices import get_input_devices
 from .braced_glob import braced_glob
 from .braced_glob import braced_iglob
 from .calculate_input_shape import calculate_input_shape
+from .compress import power_compress
+from .compress import power_uncompress
 from .convert_string_to_number import convert_string_to_number
 from .create_timestamp import create_timestamp
 from .create_ts_name import create_ts_name
sonusai/utils/compress.py
ADDED
@@ -0,0 +1,25 @@
+from sonusai.mixture import AudioF
+
+
+def power_compress(feature: AudioF) -> AudioF:
+    import numpy as np
+
+    mag = np.abs(feature)
+    phase = np.angle(feature)
+    mag = mag ** 0.3
+    real_compress = mag * np.cos(phase)
+    imag_compress = mag * np.sin(phase)
+
+    return real_compress + 1j * imag_compress
+
+
+def power_uncompress(feature: AudioF) -> AudioF:
+    import numpy as np
+
+    mag = np.abs(feature)
+    phase = np.angle(feature)
+    mag = mag ** (1. / 0.3)
+    real_uncompress = mag * np.cos(phase)
+    imag_uncompress = mag * np.sin(phase)
+
+    return real_uncompress + 1j * imag_uncompress