sonusai 0.18.6__py3-none-any.whl → 0.18.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonusai/mixture/mixdb.py CHANGED
@@ -17,6 +17,8 @@ from sonusai.mixture.datatypes import FeatureGeneratorConfig
  from sonusai.mixture.datatypes import FeatureGeneratorInfo
  from sonusai.mixture.datatypes import GeneralizedIDs
  from sonusai.mixture.datatypes import ImpulseResponseFiles
+ from sonusai.mixture.datatypes import MetricDoc
+ from sonusai.mixture.datatypes import MetricDocs
  from sonusai.mixture.datatypes import Mixture
  from sonusai.mixture.datatypes import Mixtures
  from sonusai.mixture.datatypes import NoiseFile
@@ -155,19 +157,65 @@ class MixtureDatabase:
          return json.loads(c.execute("SELECT top.asr_configs FROM top").fetchone()[0])

      @cached_property
- def supported_metrics(self) -> set[str]:
-     metrics = {
-         'mxssnravg', 'mxssnrvar', 'mxssnrdavg', 'mxssnrdstd',
-         'mxpesq', 'mxcsig', 'mxcbak', 'mxcovl', 'mxwsdr',
-         'mxpd',
-         'mxstoi',
-         'tdco', 'tmin', 'tmax', 'tpkdb', 'tlrms', 'tpkr', 'ttr', 'tcr', 'tfl', 'tpkc',
-         'ndco', 'nmin', 'nmax', 'npkdb', 'nlrms', 'npkr', 'ntr', 'ncr', 'nfl', 'npkc',
-         'sedavg', 'sedcnt', 'sedtopn',
-         'ssnr',
-     }
+ def supported_metrics(self) -> MetricDocs:
+     metrics = MetricDocs([
+         MetricDoc('Mixture Metrics', 'mxsnr', 'SNR specification in dB'),
+         MetricDoc('Mixture Metrics', 'mxssnr_avg', 'Segmental SNR average over all frames'),
+         MetricDoc('Mixture Metrics', 'mxssnr_std', 'Segmental SNR standard deviation over all frames'),
+         MetricDoc('Mixture Metrics', 'mxssnrdb_avg',
+                   'Segmental SNR average of the dB frame values over all frames'),
+         MetricDoc('Mixture Metrics', 'mxssnrdb_std',
+                   'Segmental SNR standard deviation of the dB frame values over all frames'),
+         MetricDoc('Mixture Metrics', 'mxssnrf_avg',
+                   'Per-bin segmental SNR average over all frames (using feature transform)'),
+         MetricDoc('Mixture Metrics', 'mxssnrf_std',
+                   'Per-bin segmental SNR standard deviation over all frames (using feature transform)'),
+         MetricDoc('Mixture Metrics', 'mxssnrdbf_avg',
+                   'Per-bin segmental average of the dB frame values over all frames (using feature transform)'),
+         MetricDoc('Mixture Metrics', 'mxssnrdbf_std',
+                   'Per-bin segmental standard deviation of the dB frame values over all frames (using feature transform)'),
+         MetricDoc('Mixture Metrics', 'mxpesq', 'PESQ of mixture versus true target[0]'),
+         MetricDoc('Mixture Metrics', 'mxwsdr', 'Weighted signal distortion ratio of mixture versus true target[0]'),
+         MetricDoc('Mixture Metrics', 'mxpd', 'Phase distance between mixture and true target[0]'),
+         MetricDoc('Mixture Metrics', 'mxstoi',
+                   'Short term objective intelligibility of mixture versus true target[0]'),
+         MetricDoc('Mixture Metrics', 'mxcsig',
+                   'Predicted rating of speech distortion of mixture versus true target[0]'),
+         MetricDoc('Mixture Metrics', 'mxcbak',
+                   'Predicted rating of background distortion of mixture versus true target[0]'),
+         MetricDoc('Mixture Metrics', 'mxcovl',
+                   'Predicted rating of overall quality of mixture versus true target[0]'),
+         MetricDoc('Mixture Metrics', 'ssnr', 'Segmental SNR'),
+         MetricDoc('Target Metrics', 'tdco', 'Target[0] DC offset'),
+         MetricDoc('Target Metrics', 'tmin', 'Target[0] min level'),
+         MetricDoc('Target Metrics', 'tmax', 'Target[0] max level'),
+         MetricDoc('Target Metrics', 'tpkdb', 'Target[0] Pk lev dB'),
+         MetricDoc('Target Metrics', 'tlrms', 'Target[0] RMS lev dB'),
+         MetricDoc('Target Metrics', 'tpkr', 'Target[0] RMS Pk dB'),
+         MetricDoc('Target Metrics', 'ttr', 'Target[0] RMS Tr dB'),
+         MetricDoc('Target Metrics', 'tcr', 'Target[0] Crest factor'),
+         MetricDoc('Target Metrics', 'tfl', 'Target[0] Flat factor'),
+         MetricDoc('Target Metrics', 'tpkc', 'Target[0] Pk count'),
+         MetricDoc('Noise Metrics', 'ndco', 'Noise DC offset'),
+         MetricDoc('Noise Metrics', 'nmin', 'Noise min level'),
+         MetricDoc('Noise Metrics', 'nmax', 'Noise max level'),
+         MetricDoc('Noise Metrics', 'npkdb', 'Noise Pk lev dB'),
+         MetricDoc('Noise Metrics', 'nlrms', 'Noise RMS lev dB'),
+         MetricDoc('Noise Metrics', 'npkr', 'Noise RMS Pk dB'),
+         MetricDoc('Noise Metrics', 'ntr', 'Noise RMS Tr dB'),
+         MetricDoc('Noise Metrics', 'ncr', 'Noise Crest factor'),
+         MetricDoc('Noise Metrics', 'nfl', 'Noise Flat factor'),
+         MetricDoc('Noise Metrics', 'npkc', 'Noise Pk count'),
+         MetricDoc('Truth Metrics', 'sedavg',
+                   '(not implemented) Average SED activity over all frames [num_classes, 1]'),
+         MetricDoc('Truth Metrics', 'sedcnt',
+                   '(not implemented) Count in number of frames that SED is active [num_classes, 1]'),
+         MetricDoc('Truth Metrics', 'sedtop3', '(not implemented) 3 most active by largest sedavg [3, 1]'),
+         MetricDoc('Truth Metrics', 'sedtopn', '(not implemented) N most active by largest sedavg [N, 1]'),
+     ])
      for name in self.asr_configs:
-         metrics.add(f'mxwer.{name}')
+         metrics.append(MetricDoc('Mixture Metrics', f'mxwer.{name}',
+                                  f'Word error rate using {name} ASR as defined in mixdb asr_configs parameter'))

      return metrics
 
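`supported_metrics` now returns structured `MetricDocs` entries rather than a bare `set[str]`, so each metric carries a category and a description alongside its name. A minimal sketch of how a caller might print the new self-documenting listing, assuming `MetricDoc` exposes `category`, `name`, and `description` attributes matching its positional constructor arguments:

    from sonusai.mixture import MixtureDatabase

    mixdb = MixtureDatabase('path/to/mixdb_location')  # hypothetical location
    for doc in mixdb.supported_metrics:
        # e.g. "Mixture Metrics  mxssnr_avg  Segmental SNR average over all frames"
        print(f'{doc.category:16} {doc.name:14} {doc.description}')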
@@ -240,11 +288,15 @@ class MixtureDatabase:
      def total_feature_frames(self, m_ids: GeneralizedIDs = '*') -> int:
          return self.total_samples(m_ids) // self.feature_step_samples

- def mixture_transform_frames(self, samples: int) -> int:
-     return samples // self.ft_config.R
+ def mixture_transform_frames(self, m_id: int) -> int:
+     from .helpers import frames_from_samples
+
+     return frames_from_samples(self.mixture(m_id).samples, self.ft_config.R)

- def mixture_feature_frames(self, samples: int) -> int:
-     return samples // self.feature_step_samples
+ def mixture_feature_frames(self, m_id: int) -> int:
+     from .helpers import frames_from_samples
+
+     return frames_from_samples(self.mixture(m_id).samples, self.feature_step_samples)

      def mixids_to_list(self, m_ids: Optional[GeneralizedIDs] = None) -> list[int]:
          """Resolve generalized mixture IDs to a list of integers
@@ -907,8 +959,8 @@ class MixtureDatabase:
          truth_t = self.mixture_truth_t(m_id=m_id, targets=targets, noise=noise, force=force)

          m = self.mixture(m_id)
- transform_frames = self.mixture_transform_frames(m.samples)
- feature_frames = self.mixture_feature_frames(m.samples)
+ transform_frames = self.mixture_transform_frames(m_id)
+ feature_frames = self.mixture_feature_frames(m_id)

          if truth_t is None:
              truth_t = np.zeros((m.samples, self.num_classes), dtype=np.float32)
@@ -1149,7 +1201,8 @@ class MixtureDatabase:
          from sonusai import SonusAIError
          from sonusai.metrics import calc_audio_stats
          from sonusai.metrics import calc_phase_distance
- from sonusai.metrics import calc_snr_f
+ from sonusai.metrics import calc_segsnr_f
+ from sonusai.metrics import calc_segsnr_f_bin
          from sonusai.metrics import calc_speech
          from sonusai.metrics import calc_wer
          from sonusai.metrics import calc_wsdr
@@ -1158,7 +1211,7 @@ class MixtureDatabase:
          from sonusai.mixture import SpeechMetrics
          from sonusai.utils import calc_asr

- def create_target_audio() -> Callable:
+ def create_target_audio() -> Callable[[], np.ndarray]:
              state = None

              def get() -> np.ndarray:
@@ -1171,7 +1224,20 @@ class MixtureDatabase:
          target_audio = create_target_audio()

- def create_noise_audio() -> Callable:
+ def create_target_f() -> Callable[[], np.ndarray]:
+     state = None
+
+     def get() -> np.ndarray:
+         nonlocal state
+         if state is None:
+             state = self.mixture_targets_f(m_id)[0]
+         return state
+
+     return get
+
+ target_f = create_target_f()
+
+ def create_noise_audio() -> Callable[[], np.ndarray]:
              state = None

              def get() -> np.ndarray:
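`create_target_f` (and `create_noise_f` below) follow the same lazy-caching closure pattern as the existing `create_*` helpers: the expensive value is computed on first call and memoized via `nonlocal`, so a metric that never touches it costs nothing. The pattern in isolation, as a generic sketch:

    from typing import Callable, Optional, TypeVar

    T = TypeVar('T')

    def make_lazy(compute: Callable[[], T]) -> Callable[[], T]:
        state: Optional[T] = None

        def get() -> T:
            nonlocal state
            if state is None:
                state = compute()  # computed once, reused on later calls
            return state

        return get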
@@ -1184,7 +1250,20 @@ class MixtureDatabase:
          noise_audio = create_noise_audio()

- def create_mixture_audio() -> Callable:
+ def create_noise_f() -> Callable[[], np.ndarray]:
+     state = None
+
+     def get() -> np.ndarray:
+         nonlocal state
+         if state is None:
+             state = self.mixture_noise_f(m_id)
+         return state
+
+     return get
+
+ noise_f = create_noise_f()
+
+ def create_mixture_audio() -> Callable[[], np.ndarray]:
              state = None

              def get() -> np.ndarray:
@@ -1197,7 +1276,7 @@ class MixtureDatabase:
          mixture_audio = create_mixture_audio()

- def create_segsnr_f() -> Callable:
+ def create_segsnr_f() -> Callable[[], np.ndarray]:
              state = None

              def get() -> np.ndarray:
@@ -1210,7 +1289,7 @@ class MixtureDatabase:
          segsnr_f = create_segsnr_f()

- def create_speech() -> Callable:
+ def create_speech() -> Callable[[], SpeechMetrics]:
              state = None

              def get() -> SpeechMetrics:
@@ -1223,7 +1302,7 @@ class MixtureDatabase:
          speech = create_speech()

- def create_target_stats() -> Callable:
+ def create_target_stats() -> Callable[[], AudioStatsMetrics]:
              state = None

              def get() -> AudioStatsMetrics:
@@ -1236,7 +1315,7 @@ class MixtureDatabase:
          target_stats = create_target_stats()

- def create_noise_stats() -> Callable:
+ def create_noise_stats() -> Callable[[], AudioStatsMetrics]:
              state = None

              def get() -> AudioStatsMetrics:
@@ -1286,17 +1365,29 @@ class MixtureDatabase:
                  # TODO: should this be NaN like above?
                  return float(0)

- if m == 'mxssnravg':
-     return calc_snr_f(segsnr_f()).mean
+ if m == 'mxssnr_avg':
+     return calc_segsnr_f(segsnr_f()).avg
+
+ if m == 'mxssnr_std':
+     return calc_segsnr_f(segsnr_f()).std
+
+ if m == 'mxssnrdb_avg':
+     return calc_segsnr_f(segsnr_f()).db_avg

- if m == 'mxssnrvar':
-     return calc_snr_f(segsnr_f()).var
+ if m == 'mxssnrdb_std':
+     return calc_segsnr_f(segsnr_f()).db_std

- if m == 'mxssnrdavg':
-     return calc_snr_f(segsnr_f()).db_mean
+ if m == 'mxssnrf_avg':
+     return calc_segsnr_f_bin(target_f(), noise_f()).avg

- if m == 'mxssnrdstd':
-     return calc_snr_f(segsnr_f()).db_std
+ if m == 'mxssnrf_std':
+     return calc_segsnr_f_bin(target_f(), noise_f()).std
+
+ if m == 'mxssnrdbf_avg':
+     return calc_segsnr_f_bin(target_f(), noise_f()).db_avg
+
+ if m == 'mxssnrdbf_std':
+     return calc_segsnr_f_bin(target_f(), noise_f()).db_std

              if m == 'mxpesq':
                  if self.mixture(m_id).snr < -96:
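Note the naming break relative to 0.18.6: the old keys `mxssnravg`, `mxssnrvar`, `mxssnrdavg`, and `mxssnrdstd` are gone, replaced by the underscore forms above (with variance dropped in favor of standard deviation), and the stats object now exposes `avg`/`std`/`db_avg`/`db_std` instead of `mean`/`var`/`db_mean`/`db_std`. A hypothetical call-site migration, with `mixdb` as constructed in the earlier sketch and assuming a `mixture_metrics(m_id, metrics)` entry point that dispatches to the branches above:

    # 0.18.6                      ->  0.18.7
    # 'mxssnravg'                 ->  'mxssnr_avg'   (.mean -> .avg)
    # 'mxssnrvar'                 ->  'mxssnr_std'   (variance dropped)
    # 'mxssnrdavg' / 'mxssnrdstd' ->  'mxssnrdb_avg' / 'mxssnrdb_std'
    values = mixdb.mixture_metrics(m_id=0, metrics=['mxssnr_avg', 'mxssnrdb_std'])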
@@ -1306,17 +1397,17 @@ class MixtureDatabase:
              if m == 'mxcsig':
                  if self.mixture(m_id).snr < -96:
                      return 0
-     return speech().c_sig
+     return speech().csig

              if m == 'mxcbak':
                  if self.mixture(m_id).snr < -96:
                      return 0
-     return speech().c_bak
+     return speech().cbak

              if m == 'mxcovl':
                  if self.mixture(m_id).snr < -96:
                      return 0
-     return speech().c_ovl
+     return speech().covl

              if m == 'mxwsdr':
                  mixture = mixture_audio()[:, np.newaxis]
@@ -1328,8 +1419,7 @@ class MixtureDatabase:
              if m == 'mxpd':
                  mixture_f = self.mixture_mixture_f(m_id)
- target_f = self.mixture_target_f(m_id)
- return calc_phase_distance(hypothesis=mixture_f, reference=target_f)[0]
+ return calc_phase_distance(hypothesis=mixture_f, reference=target_f())[0]

              if m == 'mxstoi':
                  return stoi(x=target_audio(), y=mixture_audio(), fs_sig=SAMPLE_RATE, extended=False)
@@ -1400,11 +1490,14 @@ class MixtureDatabase:
              if m == 'sedcnt':
                  return 0

+ if m == 'sedtop3':
+     return np.zeros(3, dtype=np.float32)
+
              if m == 'sedtopn':
                  return 0

              if m == 'ssnr':
- return self.mixture_segsnr(m_id)
+ return segsnr_f()

              raise SonusAIError(f"Unrecognized metric: '{m}'")
@@ -210,6 +210,131 @@ class Transformer(SoxTransformer):
          return self

+ def build(self,
+           input_filepath: Optional[str | Path] = None,
+           output_filepath: Optional[str | Path] = None,
+           input_array: Optional[np.ndarray] = None,
+           sample_rate_in: Optional[float] = None,
+           extra_args: Optional[list[str]] = None,
+           return_output: bool = False) -> tuple[bool, Optional[str], Optional[str]]:
+     """Given an input file or array, creates an output_file on disk by
+     executing the current set of commands. This function returns True on
+     success. If return_output is True, this function returns a triple of
+     (status, out, err), giving the success state, along with stdout and
+     stderr returned by sox.
+
+     Parameters
+     ----------
+     input_filepath : str or None
+         Either path to input audio file or None for array input.
+     output_filepath : str
+         Path to desired output file. If a file already exists at
+         the given path, the file will be overwritten.
+         If '-n', no file is created.
+     input_array : np.ndarray or None
+         An np.ndarray of a waveform with shape (n_samples, n_channels).
+         sample_rate_in must also be provided.
+         If None, input_filepath must be specified.
+     sample_rate_in : int
+         Sample rate of input_array.
+         This argument is ignored if input_array is None.
+     extra_args : list or None, default=None
+         If a list is given, these additional arguments are passed to SoX
+         at the end of the list of effects.
+         Don't use this argument unless you know exactly what you're doing!
+     return_output : bool, default=False
+         If True, returns the status and information sent to stderr and
+         stdout as a tuple (status, stdout, stderr).
+         If output_filepath is None, return_output=True by default.
+         If False, returns True on success.
+
+     Returns
+     -------
+     status : bool
+         True on success.
+     out : str (optional)
+         This is not returned unless return_output is True.
+         When returned, captures the stdout produced by sox.
+     err : str (optional)
+         This is not returned unless return_output is True.
+         When returned, captures the stderr produced by sox.
+
+     Examples
+     --------
+     > import numpy as np
+     > import sox
+     > tfm = sox.Transformer()
+     > sample_rate = 44100
+     > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)
+
+     file in, file out - basic usage
+
+     > status = tfm.build('path/to/input.wav', 'path/to/output.mp3')
+
+     file in, file out - equivalent usage
+
+     > status = tfm.build(
+           input_filepath='path/to/input.wav',
+           output_filepath='path/to/output.mp3'
+       )
+
+     array in, file out
+
+     > status = tfm.build(
+           input_array=y, sample_rate_in=sample_rate,
+           output_filepath='path/to/output.mp3'
+       )
+
+     """
+     from sox import file_info
+     from sox.core import SoxError
+     from sox.core import sox
+     from sox.log import logger
+
+     input_format, input_filepath = self._parse_inputs(
+         input_filepath, input_array, sample_rate_in
+     )
+
+     if output_filepath is None:
+         raise ValueError("output_filepath is not specified!")
+
+     # set output parameters
+     if input_filepath == output_filepath:
+         raise ValueError(
+             "input_filepath must be different from output_filepath."
+         )
+     file_info.validate_output_file(output_filepath)
+
+     args = []
+     args.extend(self.globals)
+     args.extend(self._input_format_args(input_format))
+     args.append(input_filepath)
+     args.extend(self._output_format_args(self.output_format))
+     args.append(output_filepath)
+     args.extend(self.effects)
+
+     if extra_args is not None:
+         if not isinstance(extra_args, list):
+             raise ValueError("extra_args must be a list.")
+         args.extend(extra_args)
+
+     status, out, err = sox(args, input_array, True)
+     if status != 0:
+         raise SoxError(
+             f"Stdout: {out}\nStderr: {err}"
+         )
+
+     logger.info(
+         "Created %s with effects: %s",
+         output_filepath,
+         " ".join(self.effects_log)
+     )
+
+     if return_output:
+         return status, out, err
+
+     return True, None, None
+
      def build_array(self,
                      input_filepath: Optional[str | Path] = None,
                      input_array: Optional[np.ndarray] = None,
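This `build` override keeps the upstream pysox flow (argument assembly, `sox()` invocation, error handling) but changes the return contract: it always yields a 3-tuple, returning `(True, None, None)` on success when `return_output` is False, where upstream `sox.Transformer.build` returns a bare bool. Callers therefore unpack three values; a short sketch, assuming this `Transformer` subclass is importable from `sonusai.mixture`:

    from sonusai.mixture import Transformer  # assumed export path

    tfm = Transformer()
    tfm.norm(-3.0)  # any standard pysox effect works here
    status, out, err = tfm.build('speech.wav', 'speech_norm.wav')
    assert status and out is None and err is None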
@@ -3,13 +3,14 @@ from sonusai.mixture.datatypes import TruthFunctionConfig
 
 
  class Data:
- def __init__(self, target_audio: AudioT,
+ def __init__(self,
+              target_audio: AudioT,
               noise_audio: AudioT,
               mixture_audio: AudioT,
               config: TruthFunctionConfig) -> None:
          import numpy as np
- from pyaaware import AawareForwardTransform
- from pyaaware import AawareInverseTransform
+ from sonusai import ForwardTransform
+ from sonusai import InverseTransform
          from pyaaware import FeatureGenerator

          from sonusai import SonusAIError
@@ -33,25 +34,25 @@ class Data:
          self.offsets = range(0, len(target_audio), self.frame_size)
          self.zero_based_indices = [x - 1 for x in config.index]
- self.target_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                          R=fg.ftransform_R,
-                                          bin_start=fg.bin_start,
-                                          bin_end=fg.bin_end,
-                                          ttype=fg.ftransform_ttype)
- self.noise_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                         R=fg.ftransform_R,
-                                         bin_start=fg.bin_start,
-                                         bin_end=fg.bin_end,
-                                         ttype=fg.ftransform_ttype)
- self.mixture_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                           R=fg.ftransform_R,
-                                           bin_start=fg.bin_start,
-                                           bin_end=fg.bin_end,
-                                           ttype=fg.ftransform_ttype)
- self.swin = AawareInverseTransform(N=fg.itransform_N,
-                                    R=fg.itransform_R,
-                                    bin_start=fg.bin_start,
-                                    bin_end=fg.bin_end,
-                                    ttype=fg.itransform_ttype,
-                                    gain=np.float32(1)).W
+ self.target_fft = ForwardTransform(N=fg.ftransform_N,
+                                    R=fg.ftransform_R,
+                                    bin_start=fg.bin_start,
+                                    bin_end=fg.bin_end,
+                                    ttype=fg.ftransform_ttype)
+ self.noise_fft = ForwardTransform(N=fg.ftransform_N,
+                                   R=fg.ftransform_R,
+                                   bin_start=fg.bin_start,
+                                   bin_end=fg.bin_end,
+                                   ttype=fg.ftransform_ttype)
+ self.mixture_fft = ForwardTransform(N=fg.ftransform_N,
+                                     R=fg.ftransform_R,
+                                     bin_start=fg.bin_start,
+                                     bin_end=fg.bin_end,
+                                     ttype=fg.ftransform_ttype)
+ self.swin = InverseTransform(N=fg.itransform_N,
+                              R=fg.itransform_R,
+                              bin_start=fg.bin_start,
+                              bin_end=fg.bin_end,
+                              ttype=fg.itransform_ttype,
+                              gain=np.float32(1)).W
          self.truth = np.zeros((len(target_audio), config.num_classes), dtype=np.float32)
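Throughout the truth functions, call sites now take `ForwardTransform` and `InverseTransform` re-exported from the top-level `sonusai` package instead of importing pyaaware's `Aaware*` classes directly; the torch-based calling convention that accompanies this change is summarized after the `_execute_fft` hunk below.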
@@ -132,9 +132,11 @@ def energy_t(data: Data) -> Truth:
      will reflect the total energy over all bins regardless of the feature
      transform config.
      """
+ import torch
+
      from sonusai import SonusAIError

- _, target_energy = data.target_fft.execute_all(data.target_audio)
+ target_energy = data.target_fft.execute_all(torch.from_numpy(data.target_audio))[1].numpy()
      if len(target_energy) != len(data.offsets):
          raise SonusAIError(f'Number of frames in target_energy, {len(target_energy)},'
                             f' is not number of frames in truth, {len(data.offsets)}')
@@ -21,6 +21,7 @@ should be set to the number of sounds/classes to be detected + 1 for
      the other class.
      """
      import numpy as np
+ import torch
      from pyaaware import SED

      from sonusai import SonusAIError
@@ -48,7 +49,7 @@ the other class.
                mutex=data.config.mutex)

      target_audio = data.target_audio / data.config.target_gain
- _, energy_t = data.target_fft.execute_all(target_audio)
+ energy_t = data.target_fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
      if len(energy_t) != len(data.offsets):
          raise SonusAIError(f'Number of frames in energy_t, {len(energy_t)},'
                             f' is not number of frames in truth, {len(data.offsets)}')
@@ -1,4 +1,4 @@
- from pyaaware import ForwardTransform
+ from sonusai import ForwardTransform

  from sonusai.mixture.datatypes import AudioF
  from sonusai.mixture.datatypes import AudioT
@@ -98,7 +98,6 @@ Output shape: [:, 2 * bins] (stacked real, imag)
      for idx, offset in enumerate(data.offsets):
          target_freq, _ = data.target_fft.execute(
              np.multiply(data.target_audio[offset:offset + data.frame_size], data.swin))
- target_freq = target_freq.transpose()

          indices = slice(offset, offset + data.frame_size)
          for index in data.zero_based_indices:
@@ -112,10 +111,10 @@ Output shape: [:, 2 * bins] (stacked real, imag)
  def _execute_fft(audio: AudioT, transform: ForwardTransform, expected_frames: int) -> AudioF:
+ import torch
      from sonusai import SonusAIError

- freq, _ = transform.execute_all(audio)
- freq = freq.transpose()
+ freq = transform.execute_all(torch.from_numpy(audio))[0].numpy()
      if len(freq) != expected_frames:
          raise SonusAIError(f'Number of frames, {len(freq)}, is not number of frames expected, {expected_frames}')
      return freq
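These hunks all migrate the same calling convention: `execute_all` now takes a torch tensor and returns a `(frequency, energy)` pair of torch tensors, where the old pyaaware API accepted numpy and needed a transpose afterwards. A consolidating sketch of the new convention, assuming `transform` is one of the `sonusai` `ForwardTransform` instances used above:

    import numpy as np
    import torch

    def forward_all(transform, audio: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        # index 0: frequency data [frames, bins]; index 1: per-frame energy
        freq, energy = transform.execute_all(torch.from_numpy(audio))
        return freq.numpy(), energy.numpy()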
@@ -131,7 +131,7 @@ def _process(file: str) -> None:
      import h5py
      import numpy as np
- from pyaaware import AawareInverseTransform
+ from sonusai import InverseTransform

      from sonusai import SonusAIError
      from sonusai.mixture import get_audio_from_transform
@@ -147,12 +147,12 @@ def _process(file: str) -> None:
      output_name = join(MP_GLOBAL.output_dir, splitext(basename(file))[0] + '.wav')
      audio, _ = get_audio_from_transform(data=predict,
-                                         transform=AawareInverseTransform(N=MP_GLOBAL.N,
-                                                                          R=MP_GLOBAL.R,
-                                                                          bin_start=MP_GLOBAL.bin_start,
-                                                                          bin_end=MP_GLOBAL.bin_end,
-                                                                          ttype=MP_GLOBAL.ttype,
-                                                                          gain=np.float32(1)))
+                                         transform=InverseTransform(N=MP_GLOBAL.N,
+                                                                    R=MP_GLOBAL.R,
+                                                                    bin_start=MP_GLOBAL.bin_start,
+                                                                    bin_end=MP_GLOBAL.bin_end,
+                                                                    ttype=MP_GLOBAL.ttype,
+                                                                    gain=np.float32(1)))
      write_audio(name=output_name, audio=float_to_int16(audio))
sonusai/utils/__init__.py CHANGED
@@ -9,6 +9,8 @@ from .audio_devices import get_input_devices
  from .braced_glob import braced_glob
  from .braced_glob import braced_iglob
  from .calculate_input_shape import calculate_input_shape
+ from .compress import power_compress
+ from .compress import power_uncompress
  from .convert_string_to_number import convert_string_to_number
  from .create_timestamp import create_timestamp
  from .create_ts_name import create_ts_name
@@ -0,0 +1,25 @@
+ from sonusai.mixture import AudioF
+
+
+ def power_compress(feature: AudioF) -> AudioF:
+     import numpy as np
+
+     mag = np.abs(feature)
+     phase = np.angle(feature)
+     mag = mag ** 0.3
+     real_compress = mag * np.cos(phase)
+     imag_compress = mag * np.sin(phase)
+
+     return real_compress + 1j * imag_compress
+
+
+ def power_uncompress(feature: AudioF) -> AudioF:
+     import numpy as np
+
+     mag = np.abs(feature)
+     phase = np.angle(feature)
+     mag = mag ** (1. / 0.3)
+     real_uncompress = mag * np.cos(phase)
+     imag_uncompress = mag * np.sin(phase)
+
+     return real_uncompress + 1j * imag_uncompress
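The new `sonusai.utils.compress` helpers implement magnitude power compression (`|X| ** 0.3` with phase preserved), a common front-end trick in speech-enhancement models; `power_uncompress` applies the inverse exponent, so the pair round-trips up to floating-point error. A quick sanity check:

    import numpy as np
    from sonusai.utils import power_compress, power_uncompress

    rng = np.random.default_rng(0)
    spec = rng.normal(size=(10, 257)) + 1j * rng.normal(size=(10, 257))
    restored = power_uncompress(power_compress(spec))
    assert np.allclose(spec, restored)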
sonusai/utils/energy_f.py CHANGED
@@ -1,5 +1,4 @@
- from pyaaware import ForwardTransform
-
+ from sonusai import ForwardTransform
  from sonusai.mixture import AudioF
  from sonusai.mixture import AudioT
  from sonusai.mixture import EnergyF
@@ -19,7 +18,7 @@ def compute_energy_f(frequency_domain: AudioF = None,
      :return: Frequency domain per-bin energy data [frames, bins]
      """
      import numpy as np
-
+ import torch
      from sonusai import SonusAIError

      if frequency_domain is None:
@@ -28,7 +27,7 @@ def compute_energy_f(frequency_domain: AudioF = None,
      if transform is None:
          raise SonusAIError('Must provide ForwardTransform object')

- frequency_domain, _ = transform.execute_all(time_domain)
+ frequency_domain = transform.execute_all(torch.from_numpy(time_domain))[0].numpy()

      frames, bins = frequency_domain.shape
      result = np.empty((frames, bins), dtype=np.float32)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonusai
- Version: 0.18.6
+ Version: 0.18.7
  Summary: Framework for building deep neural network models for sound, speech, and voice AI
  Home-page: https://aaware.com
  License: GPL-3.0-only