sonusai 0.19.10__py3-none-any.whl → 0.20.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/data/genmixdb.yml +4 -2
- sonusai/doc/doc.py +14 -0
- sonusai/ir_metric.py +555 -0
- sonusai/metrics_summary.py +5 -3
- sonusai/mixture/__init__.py +4 -1
- sonusai/mixture/audio.py +103 -12
- sonusai/mixture/augmentation.py +199 -84
- sonusai/mixture/config.py +9 -4
- sonusai/mixture/constants.py +0 -1
- sonusai/mixture/datatypes.py +19 -10
- sonusai/mixture/generation.py +11 -12
- sonusai/mixture/helpers.py +20 -23
- sonusai/mixture/ir_delay.py +63 -0
- sonusai/mixture/mixdb.py +103 -19
- sonusai/mixture/targets.py +3 -6
- sonusai/utils/__init__.py +2 -0
- sonusai/utils/temp_seed.py +13 -0
- {sonusai-0.19.10.dist-info → sonusai-0.20.2.dist-info}/METADATA +2 -2
- {sonusai-0.19.10.dist-info → sonusai-0.20.2.dist-info}/RECORD +21 -23
- {sonusai-0.19.10.dist-info → sonusai-0.20.2.dist-info}/WHEEL +1 -1
- sonusai/mixture/soundfile_audio.py +0 -130
- sonusai/mixture/sox_audio.py +0 -476
- sonusai/mixture/sox_augmentation.py +0 -136
- sonusai/mixture/torchaudio_audio.py +0 -106
- sonusai/mixture/torchaudio_augmentation.py +0 -109
- {sonusai-0.19.10.dist-info → sonusai-0.20.2.dist-info}/entry_points.txt +0 -0
sonusai/data/genmixdb.yml
CHANGED
@@ -23,7 +23,8 @@ truth_configs: { }
 
 asr_manifest: [ ]
 
-target_augmentations:
+target_augmentations:
+  - pre:
 
 class_balancing_augmentation:
   normalize: -3.5
@@ -39,7 +40,8 @@ noises:
   - "${default_noise}"
 
 noise_augmentations:
-  -
+  - pre:
+      normalize: -3.5
 
 snrs:
   - 99
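For context, this change nests the default augmentation rules under a new "pre:" key instead of listing effects directly. A minimal sketch of what the new-style rule looks like when loaded in Python; the YAML mirrors the default shown above, and nothing beyond those keys is assumed:

import yaml

# New-style rule from the default genmixdb.yml above: effects are nested under "pre"
rule_yaml = """
noise_augmentations:
  - pre:
      normalize: -3.5
"""
config = yaml.safe_load(rule_yaml)
pre_effects = config["noise_augmentations"][0]["pre"]
print(pre_effects)  # {'normalize': -3.5}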
sonusai/doc/doc.py
CHANGED
@@ -329,6 +329,20 @@ See 'augmentations' for details on augmentation rules.
     # fmt: on
 
 
+def doc_target_distortions() -> str:
+    import yaml
+
+    default = f"\nDefault value:\n\n{yaml.dump(get_default_config()['target_distortions'])}"
+    # fmt: off
+    return """
+'target_distortions' is a mixture database configuration parameter that
+specifies a list of distortion rules to use for each target.
+
+See 'augmentations' for details on distortion rules.
+""" + default
+    # fmt: on
+
+
 def doc_noises() -> str:
     default = f"\nDefault value: {get_default_config()['class_balancing']}"
     # fmt: off
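A hedged usage sketch of the new helper. The import path follows the file path above (sonusai/doc/doc.py); whether it is also re-exported from sonusai.doc is not shown in this diff, and the call assumes the 0.20.x default config defines 'target_distortions':

# Sketch: print the new 'target_distortions' help text
from sonusai.doc.doc import doc_target_distortions

print(doc_target_distortions())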
sonusai/ir_metric.py
ADDED
@@ -0,0 +1,555 @@
"""sonusai ir_metric

usage: ir_metric [-hv] [-n NCPU] IRLOC

options:
    -h, --help
    -v, --verbose               Be verbose.
    -n, --num_process NCPU      Number of parallel processes to use [default: auto]

Calculate delay and gain metrics of impulse response (IR) files <filename>.wav in IRLOC.
Metrics include gain and multiple ways to calculate the IR delay:
- gmax: max abs(fft(ir,4096))
- dcc: cross-correlation of ir with pulse train
- dmax: index of max(ir)
- dgd: group delay method
- dcen: centroid of energy

Results are written into IRLOC/ir_metrics.txt

IRLOC directory containing impulse response data in audio files (.wav, .flac, etc.). Only first channel is analyzed.

"""

import glob
import signal
from os.path import abspath
from os.path import basename
from os.path import commonprefix
from os.path import dirname
from os.path import isdir
from os.path import isfile
from os.path import join
from os.path import relpath
from os.path import splitext

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile
from numpy import fft

from sonusai.utils import braced_iglob


def signal_handler(_sig, _frame):
    import sys

    from sonusai import logger

    logger.info("Canceled due to keyboard interrupt")
    sys.exit(1)


signal.signal(signal.SIGINT, signal_handler)


def tdoa(signal, reference, interp=1, phat=False, fs=1, t_max=None):
    """
    Estimates the shift of array signal with respect to reference
    using generalized cross-correlation

    Parameters
    ----------
    signal: array_like
        The array whose tdoa is measured
    reference: array_like
        The reference array
    interp: int, optional
        The interpolation factor for the output array, default 1.
    phat: bool, optional
        Apply the PHAT weighting (default False)
    fs: int or float, optional
        The sampling frequency of the input arrays, default=1

    Returns
    -------
    The estimated delay between the two arrays
    """

    signal = np.array(signal)
    reference = np.array(reference)

    N1 = signal.shape[0]
    N2 = reference.shape[0]

    r_12 = correlate(signal, reference, interp=interp, phat=phat)

    delay = (np.argmax(np.abs(r_12)) / interp - (N2 - 1)) / fs

    return delay


def correlate(x1, x2, interp=1, phat=False):
    """
    Compute the cross-correlation between x1 and x2

    Parameters
    ----------
    x1,x2: array_like
        The data arrays
    interp: int, optional
        The interpolation factor for the output array, default 1.
    phat: bool, optional
        Apply the PHAT weighting (default False)

    Returns
    -------
    The cross-correlation between the two arrays
    """

    N1 = x1.shape[0]
    N2 = x2.shape[0]

    N = N1 + N2 - 1

    X1 = fft.rfft(x1, n=N)
    X2 = fft.rfft(x2, n=N)

    if phat:
        eps1 = np.mean(np.abs(X1)) * 1e-10
        X1 /= np.abs(X1) + eps1
        eps2 = np.mean(np.abs(X2)) * 1e-10
        X2 /= np.abs(X2) + eps2

    m = np.minimum(N1, N2)

    out = fft.irfft(X1 * np.conj(X2), n=int(N * interp))

    return np.concatenate([out[-interp * (N2 - 1) :], out[: (interp * N1)]])


def hilbert(u):
    # N : fft length
    # M : number of elements to zero out
    # U : DFT of u
    # v : IDFT of H(U)

    N = len(u)
    # take forward Fourier transform
    U = fft.fft(u)
    M = N - N // 2 - 1
    # zero out negative frequency components
    U[N // 2 + 1 :] = [0] * M
    # double fft energy except @ DC0
    U[1 : N // 2] = 2 * U[1 : N // 2]
    # take inverse Fourier transform
    v = fft.ifft(U)
    return v


def measure_rt60(h, fs=1, decay_db=60, energy_thres=1.0, plot=False, rt60_tgt=None):
    """
    RT60 Measurement Routine (taken/modified from Pyroom acoustics.)

    Calculates reverberation time of an impulse response using the Schroeder method [1].
    Returns:
        rt60: Reverberation time to -60db (-5db to -65db), will be estimated from rt20 or rt10 if noise floor > -65db
        edt: Early decay time from 0db to -10db
        rt10: Reverberation time to -10db (-5db to -15db)
        rt20: Reverberation time to -20db (-5db to -25db), will be estimated from rt10 if noise floor > -25db
        floor: 0 if noise floor > -10db or energy curve is not a decay
               1 if noise floor > -15db and edt is measured, but rt10 estimated from entire energy curve length
               2 if noise -15db > floor > -25db, rt20 is estimated from measured rt10
               3 if noise -25db > floor > -65db, rt60 is estimated from measured rt20
               4 if noise floor < -65db, rt60, edt, rt10, rt20 are all measured
    Optionally plots some useful information.

    Parameters
    ----------
    h: array_like
        The impulse response.
    fs: float or int, optional
        The sampling frequency of h (default to 1, i.e., samples).
    decay_db: float or int, optional
        The decay in decibels for which we actually estimate the slope and time.
        Although we want to estimate the RT60, it might not be practical. Instead,
        we measure the RT10, RT20 or RT30 and extrapolate to RT60.
    energy_thres: float
        This should be a value between 0.0 and 1.0.
        If provided, the fit will be done using a fraction energy_thres of the
        whole energy. This is useful when there is a long noisy tail for example.
    plot: bool, optional
        If set to ``True``, the power decay and different estimated values will
        be plotted (default False).
    rt60_tgt: float
        This parameter can be used to indicate a target RT60 to which we want
        to compare the estimated value.

    References
    ----------

    [1] M. R. Schroeder, "New Method of Measuring Reverberation Time,"
        J. Acoust. Soc. Am., vol. 37, no. 3, pp. 409-412, Mar. 1968.
    """

    h = np.array(h)
    fs = float(fs)
    h = np.abs(hilbert(h))  # hilbert from scratch, see above

    # The power of the impulse response in dB
    power = h**2
    # Backward energy integration according to Schroeder
    energy = np.cumsum(power[::-1])[::-1]  # Integration according to Schroeder

    if energy_thres < 1.0:
        assert 0.0 < energy_thres < 1.0
        energy -= energy[0] * (1.0 - energy_thres)
        energy = np.maximum(energy, 0.0)

    # remove the possibly all zero tail
    i_nz = np.max(np.where(energy > 0)[0])
    energy = energy[:i_nz]
    energy_db = 10 * np.log10(energy)
    energy_db -= energy_db[0]  # normalize to first sample assuming it's the peak

    min_energy_db = -np.min(energy_db)
    if min_energy_db - 5 < decay_db:
        decay_db = min_energy_db

    # -5 dB headroom
    try:
        i_5db = np.min(np.where(energy_db < -5)[0])
    except ValueError:
        floor = 0
        return 0.0, 0.0, 0.0, 0.0, floor  # failed, energy curve is not a decay, or has noise floor tail above -5db
    e_5db = energy_db[i_5db]
    t_5db = i_5db / fs  # This is the initial decay to -5db, used as start of decay slope measurements

    # Estimate slope from 0db to -10db - this is also known as EDT (early decay time)
    try:
        i_10db = np.min(np.where(energy_db < -10)[0])
    except ValueError:
        floor = 0
        return 0.0, 0.0, 0.0, 0.0, floor  # failed, energy curve is not a decay, or noise floor tail above -10db
    e_10db = energy_db[i_10db]
    edt = i_10db / fs  # this is also known as EDT (early decay time)

    # after initial decay, estimate RT10, RT20, RT60
    try:
        i_decay10db = np.min(np.where(energy_db < -5 - 10)[0])
    except ValueError:
        floor = 1
        i_decay10db = len(energy_db)  # noise floor tail is above -15db, use entire curve
    t10_decay = i_decay10db / fs
    rt10 = t10_decay - t_5db

    try:
        i_decay20db = np.min(np.where(energy_db < -5 - 20)[0])
    except ValueError:
        floor = 2
        i_decay20db = len(energy_db)  # noise floor tail is above -20db, use entire curve
    t20_decay = i_decay20db / fs
    rt20 = t20_decay - t_5db

    try:
        i_decay60db = np.min(np.where(energy_db < -5 - 60)[0])
        t60_decay = i_decay60db / fs
        rt60 = t60_decay - t_5db
        floor = 4
    except ValueError:
        floor = 3
        i_decay60db = len(energy_db)  # noise floor tail is above -60db, use t20_decay to estimate
        t60_decay = 3 * i_decay20db / fs
        rt60 = t60_decay - t_5db

    # # extropolate to compute the rt60 decay time from decay_db decay time
    # decay_time = t_decay - t_5db
    # est_rt60 = (60 / decay_db) * decay_time

    if plot:
        # Remove clip power below to minimum energy (for plotting purpose mostly)
        energy_min = energy[-1]
        energy_db_min = energy_db[-1]
        power[power < energy[-1]] = energy_min
        power_db = 10 * np.log10(power)
        power_db -= np.max(power_db)

        # time vector
        def get_time(x, fs):
            return np.arange(x.shape[0]) / fs - i_5db / fs

        T = get_time(power_db, fs)

        # plot power and energy
        plt.plot(get_time(energy_db, fs), energy_db, label="Energy")

        # now the linear fit
        plt.plot([0, rt60], [e_5db, -65], "--", label="Linear Fit")
        plt.plot(T, np.ones_like(T) * -60, "--", label="-60 dB")
        plt.vlines(rt60, energy_db_min, 0, linestyles="dashed", label="Estimated RT60")

        if rt60_tgt is not None:
            plt.vlines(rt60_tgt, energy_db_min, 0, label="Target RT60")

        plt.legend()

    return rt60, edt, rt10, rt20, floor


def process_path(path, extlist=[".wav", ".WAV", ".flac", ".FLAC", ".mp3", ".aac"]):
    """
    Check path which can be a single file, a subdirectory, or a regex
    return:
      - a list of files with matching extensions to any in extlist provided (i.e. ['.wav', '.mp3', '.acc'])
      - the basedir of the path, if
    """
    # Check if the path is a single file, and return it as a list with the dirname
    if isfile(path):
        if any(path.endswith(ext) for ext in extlist):
            basedir = dirname(path)  # base directory
            if not basedir:
                basedir = "./"
            return [path], basedir
        else:
            return [], []

    # Check if the path is a dir, recursively find all files any of the specified extensions, return file list and dir
    if isdir(path):
        matching_files = []
        for ext in extlist:
            matching_files.extend(glob.glob(join(path, "**/*" + ext), recursive=True))
        return matching_files, path

    # Process as a regex, return list of filenames and basedir
    apath = abspath(path)  # join(abspath(path), "**", "*.{wav,flac,WAV}")
    matching_files = []
    for file in braced_iglob(pathname=apath, recursive=True):
        matching_files.append(file)
    if matching_files:
        basedir = commonprefix(matching_files)  # Find basedir
        return matching_files, basedir
    else:
        return [], []


def _process_ir(pfile: str, irtab_col: list, basedir: str) -> pd.DataFrame:
    # 1) Read ir audio file, and calc basic stats
    ir_fname = pfile[1]  # abs_path
    irwav, sample_rate = soundfile.read(ir_fname)
    if irwav.ndim == 2:
        irwav = irwav[:, 0]  # Only first channel of multi-channel
    duration = len(irwav) / sample_rate
    srk = sample_rate / 1000
    ir_basename = relpath(ir_fname, basedir)

    # 2) Compute delay via autocorrelation (not working - always zero, use interplated tdoa instead)
    # ar = np.correlate(irwav, irwav, mode='same')
    # acdelay_index = np.argmax(ar)
    # dacc= acdelay_index - len(ar) // 2  # Center the delay around 0 of 'same' mode

    # 3) Compute delay via max argument - find the peak
    peak_index = np.argmax(irwav)
    peak_value = irwav[peak_index]
    dmax = peak_index

    # 4) Calculate cross-correlation with white gaussian noise ref (ssame as pyrooma.tdoa() with interp=1)
    np.random.seed(42)
    wgn_ref = np.random.normal(0, 0.2, int(np.ceil(0.05 * sample_rate)))  # (mean,std_dev,length)
    wgn_conv = np.convolve(irwav, wgn_ref)
    wgn_corr = np.correlate(wgn_conv, wgn_ref, mode="full")  # Compute cross-correlation
    delay_index = np.argmax(np.abs(wgn_corr))  # Find the delay (need abs??, yes)
    dcc = delay_index - len(wgn_ref) + 1  # Adjust for the mode='full' shift
    # GCC with PHAT weighting known to be best, but does seem to mismatch dcc, dmax more frequently
    dtdoa = tdoa(wgn_conv, wgn_ref, interp=16, phat=True)
    gdccmax = np.max(np.abs(wgn_conv)) / np.max(np.abs(wgn_ref))  # gain of max value

    # # 4b) Calculate cross-correlation with chirp 20Hz-20KHz
    # t_end = 2  # 1s
    # t = np.linspace(0, t_end, int(t_end * sample_rate))
    # k = (20 - 20000) / t_end
    # chrp_phase = 2 * np.pi * (20 * t + 0.5 * k * t ** 2)
    # chrp = np.cos(chrp_phase)
    # chrp_convout = np.convolve(irwav,chrp)
    # chrp_corr = np.correlate(chrp_convout, chrp, mode='full')  # Compute cross-correlation
    # chrp_delay_idx = np.argmax(np.abs(chrp_corr))
    # dcchr = chrp_delay_idx - len(chrp) + 1
    # dtdoachr = tdoa(chrp_convout, chrp, interp=16, phat=False)
    # gdcchrmax = np.max(np.abs(chrp_convout)) / np.max(np.abs(chrp))
    # #sin_ref = np.sin(2 * np.pi * 500/sample_rate * np.arange(0,sample_rate))

    # # Create a pulse train alternating +1, -1, ... of width PW, spacing PS_ms
    # PS = int(0.010 * sample_rate)  # Spacing between pulses in sec (to samples)
    # PW = 5  # Pulse width in samples, make sure < PS
    # PTLEN = int(1 * sample_rate)  # Length in sec (to samples)
    # #sample_vec = np.arange(PTLEN)
    #
    # # Construct the pulse train
    # ptrain_ref = np.zeros(PTLEN)
    # polarity = 1
    # for i in range(0, PTLEN, PS):
    #     if polarity == 1:
    #         ptrain_ref[i:(i + PW)] = 1
    #         polarity = -1
    #     else:
    #         ptrain_ref[i:(i + PW)] = -1
    #         polarity = 1
    #
    # pt_convout = np.convolve(irwav,ptrain_ref)
    # pt_corr = np.correlate(pt_convout, ptrain_ref, mode='full')  # Compute cross-correlation
    # pt_delay_idx = np.argmax(np.abs(pt_corr))
    # dcc = pt_delay_idx - len(ptrain_ref) + 1
    # dtdoa = tdoa(pt_convout, ptrain_ref, interp=16, phat=True)
    # gdccptmax = np.max(np.abs(pt_convout)) / np.max(np.abs(ptrain_ref))

    # 5) Calculate delay using group_delay method
    fft_size = len(irwav)
    H = np.fft.fft(irwav, n=fft_size)
    phase = np.unwrap(np.angle(H))
    freq = np.fft.fftfreq(fft_size)  # in samples, using d=1/sampling_rate=1
    group_delay = -np.gradient(phase) / (2 * np.pi * np.gradient(freq))
    dagd = np.mean(group_delay[np.isfinite(group_delay)])  # Average group delay
    gmax = max(np.abs(H))

    rt60, edt, rt10, rt20, nfloor = measure_rt60(irwav, sample_rate, plot=False)

    # 4) Tabulate metrics as single row in table of scalar metrics per mixture
    # irtab_col = ["dmax", "dcc", "dccphat", "dagd", "gdccmax", "rt20", "rt60", "max", "min", "gmax", "dur", "sr", "irfile"]
    metr1 = [dmax, dcc, dtdoa, dagd, gdccmax, rt20, rt60, peak_value, min(irwav), gmax, duration, srk, ir_basename]
    mtab1 = pd.DataFrame([metr1], columns=irtab_col, index=[pfile[0]])  # return tuple of dataframe

    return mtab1


def main():
    from docopt import docopt

    import sonusai
    from sonusai.utils import trim_docstring

    args = docopt(trim_docstring(__doc__), version=sonusai.__version__, options_first=True)

    verbose = args["--verbose"]
    ir_location = args["IRLOC"]
    num_proc = args["--num_process"]

    import psutil

    from sonusai.utils import create_timestamp
    from sonusai.utils import par_track
    from sonusai.utils import track

    # Check location, default ext are ['.wav', '.WAV', '.flac', '.FLAC', '.mp3', '.aac']
    pfiles, basedir = process_path(ir_location)
    pfiles = sorted(pfiles, key=basename)

    if pfiles is None or len(pfiles) < 1:
        print(f"No IR audio files found in {ir_location}, exiting ...")
        raise SystemExit(1)
    elif len(pfiles) == 1:
        print(f"Found single IR audio file {ir_location} , writing to *-irmetric.txt ...")
        fbase, ext = splitext(basename(pfiles[0]))
        wlcsv_name = None
        txt_fname = str(join(basedir, fbase + "-irmetric.txt"))
    elif len(pfiles) > 1:
        print(f"Found {len(pfiles)} files under {basedir} for impulse response metric calculations")
        txt_fname = str(join(basedir, "ir_metric_summary.txt"))
        wlcsv_name = str(join(basedir, "ir_metric_list.csv"))

    num_cpu = psutil.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1)
    print(f"#CPUs: {num_cpu}, current CPU utilization: {cpu_percent}%")
    print(f"Memory utilization: {psutil.virtual_memory().percent}%")
    if num_proc == "auto":
        use_cpu = int(num_cpu * (0.9 - cpu_percent / 100))  # default use 80% of available cpus
    elif num_proc == "None":
        use_cpu = None
    else:
        use_cpu = min(max(int(num_proc), 1), num_cpu)

    timestamp = create_timestamp()
    # Individual mixtures use pandas print, set precision to 2 decimal places
    # pd.set_option('float_format', '{:.2f}'.format)
    print(f"Calculating metrics for {len(pfiles)} impulse response files using {use_cpu} parallel processes ...")
    progress = track(total=len(pfiles))
    if use_cpu is None or len(pfiles) == 1:
        no_par = True
        num_cpus = None
    else:
        no_par = False
        num_cpus = use_cpu

    from functools import partial

    # Setup pandas table for summarizing ir metrics
    irtab_col = [
        "dmax",
        "dcc",
        "dccphat",
        "dagd",
        "gdccmax",
        "rt20",
        "rt60",
        "max",
        "min",
        "gmax",
        "dur",
        "sr",
        "irfile",
    ]
    idx = range(len(pfiles))
    llfiles = list(zip(idx, pfiles, strict=False))

    all_metrics_tables = par_track(
        partial(
            _process_ir,
            irtab_col=irtab_col,
            basedir=basedir,
        ),
        llfiles,
        progress=progress,
        num_cpus=num_cpus,
        no_par=no_par,
    )
    progress.close()

    # progress = tqdm(total=len(pfiles), desc='ir_metric')
    # if use_cpu is None:
    #     all_metrics_tab = pp_tqdm_imap(_process_mixture, pfiles, progress=progress, no_par=True)
    # else:
    #     all_metrics_tab = pp_tqdm_imap(_process_mixture, pfiles, progress=progress, num_cpus=use_cpu)
    # progress.close()

    header_args = {
        "mode": "a",
        "encoding": "utf-8",
        "index": False,
        "header": False,
    }
    table_args = {
        "mode": "a",
        "encoding": "utf-8",
    }

    all_metrics_tab = pd.concat([item for item in all_metrics_tables])  # already sorted by truth filename via idx
    mtabsort = all_metrics_tab.sort_values(by=["irfile"])

    # Write list to .csv
    if wlcsv_name:
        pd.DataFrame([["Timestamp", timestamp]]).to_csv(wlcsv_name, header=False, index=False)
        pd.DataFrame([f"IR metric list for {ir_location}:"]).to_csv(wlcsv_name, mode="a", header=False, index=False)
        mtabsort.round(2).to_csv(wlcsv_name, **table_args)

    # Write summary and list to .txt
    with open(txt_fname, "w") as f:
        print(f"Timestamp: {timestamp}", file=f)
        print(f"IR metrics stats over {len(llfiles)} files:", file=f)
        print(mtabsort.describe().round(3).T.to_string(float_format=lambda x: f"{x:.3f}", index=True), file=f)
        print("", file=f)
        print("", file=f)
        print([f"IR metric list for {ir_location}:"], file=f)
        print(mtabsort.round(3).to_string(), file=f)


if __name__ == "__main__":
    main()
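To illustrate what the new module measures, here is a minimal sketch that applies the measure_rt60 and tdoa routines defined above to a synthetic exponentially decaying impulse response. The 16 kHz rate, 50 ms decay constant, and 40-sample shift are arbitrary values for the example, not values taken from the package:

import numpy as np
from sonusai.ir_metric import measure_rt60, tdoa

sample_rate = 16000
t = np.arange(int(0.5 * sample_rate)) / sample_rate
# Synthetic IR: unit impulse followed by an exponentially decaying noise tail
rng = np.random.default_rng(0)
ir = np.exp(-t / 0.05) * rng.standard_normal(t.size)
ir[0] = 1.0

rt60, edt, rt10, rt20, floor = measure_rt60(ir, fs=sample_rate)
print(f"rt60={rt60:.3f}s edt={edt:.3f}s rt10={rt10:.3f}s rt20={rt20:.3f}s floor={floor}")

# Delay of a shifted copy relative to the original, via GCC-PHAT
delayed = np.concatenate([np.zeros(40), ir])
print(f"estimated delay: {tdoa(delayed, ir, interp=16, phat=True, fs=sample_rate) * sample_rate:.1f} samples")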
sonusai/metrics_summary.py
CHANGED
@@ -137,7 +137,9 @@ def main() -> None:
         print(f"Could not open SonusAI mixture database in {location}, exiting ...")
         return
 
-
+    # Only check first and last mixture in order to save time
+    metrics_present = mixdb.cached_metrics([0, mixdb.num_mixtures - 1])
+
     num_metrics_present = len(metrics_present)
     if num_metrics_present < 1:
         print(f"mixdb reports no pre-generated metrics are present. Nothing to summarize in {location}, exiting ...")
@@ -150,7 +152,7 @@ def main() -> None:
         create_file_handler(join(location, "metrics_summary.log"))
         update_console_handler(verbose)
         initial_log_messages("metrics_summary")
-        logger.info(f"Logging summary of SonusAI mixture
+        logger.info(f"Logging summary of SonusAI mixture database at {location}")
     else:
         update_console_handler(verbose)
 
@@ -164,7 +166,7 @@ def main() -> None:
         fsuffix = f"_s{len(mixids)}t{mixdb.num_mixtures}"
     else:
         logger.info(
-            f"Summarizing SonusAI mixture
+            f"Summarizing SonusAI mixture database with {mixdb.num_mixtures} mixtures "
            f"and {num_metrics_present} pre-generated metrics ..."
        )
        fsuffix = ""
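The added lines query cached metrics for only the first and last mixture rather than scanning the whole database. A minimal sketch of that call, assuming MixtureDatabase can be opened from a generated database directory (the "./mixdb" location and constructor usage are assumptions; only cached_metrics and num_mixtures appear in the hunk above):

from sonusai.mixture import MixtureDatabase

mixdb = MixtureDatabase("./mixdb")  # hypothetical location of a generated mixture database
# Spot-check the first and last mixture only, as metrics_summary now does
metrics_present = mixdb.cached_metrics([0, mixdb.num_mixtures - 1])
print(f"{len(metrics_present)} pre-generated metrics present")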
sonusai/mixture/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from .audio import get_duration
 from .audio import get_next_noise
 from .audio import get_num_samples
 from .audio import get_sample_rate
+from .audio import raw_read_audio
 from .audio import read_audio
 from .audio import read_ir
 from .audio import validate_input_file
@@ -53,7 +54,9 @@ from .datatypes import AudioF
 from .datatypes import AudioStatsMetrics
 from .datatypes import AudioT
 from .datatypes import Augmentation
+from .datatypes import AugmentationEffects
 from .datatypes import AugmentationRule
+from .datatypes import AugmentationRuleEffects
 from .datatypes import AugmentedTarget
 from .datatypes import ClassCount
 from .datatypes import EnergyF
@@ -111,10 +114,10 @@ from .helpers import get_transform_from_audio
 from .helpers import inverse_transform
 from .helpers import mixture_metadata
 from .helpers import write_mixture_metadata
+from .ir_delay import get_impulse_response_delay
 from .log_duration_and_sizes import log_duration_and_sizes
 from .mixdb import MixtureDatabase
 from .mixdb import db_file
-from .sox_audio import Transformer
 from .spectral_mask import apply_spectral_mask
 from .target_class_balancing import balance_targets
 from .targets import get_augmented_target_ids_by_class