sonusai 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- sonusai/calc_metric_spenh.py +524 -166
- sonusai/data_generator/torch_from_mixdb.py +2 -2
- sonusai/mixture/audio.py +6 -2
- sonusai/mixture/torchaudio_audio.py +2 -2
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/METADATA +3 -1
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/RECORD +8 -8
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/WHEEL +0 -0
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/entry_points.txt +0 -0
sonusai/calc_metric_spenh.py
CHANGED
@@ -13,7 +13,7 @@ options:
     -e WER, --wer-method WER     Word-Error-Rate method: deepgram, google, aixplain_whisper
                                  or whisper (locally run) [default: none]
     -m WMNAME, --whisper-model   Whisper model name used in aixplain_whisper and whisper WER methods.
-                                 [default:
+                                 [default: tiny]

 Calculate speech enhancement metrics of prediction data in PLOC using SonusAI mixture data
 in TLOC as truth/label reference. Metric and extraction data files are written into PLOC.
@@ -83,6 +83,7 @@ matplotlib.use('SVG')
 class MPGlobal:
     mixdb: MixtureDatabase = None
     predict_location: Location = None
+    predwav_mode: bool = None
     truth_est_mode: bool = None
     enable_plot: bool = None
     enable_wav: bool = None
@@ -111,6 +112,304 @@ def power_uncompress(spec):
     return real_uncompress + 1j * imag_uncompress


+def snr(clean_speech, processed_speech, sample_rate):
+    # Check the length of the clean and processed speech. Must be the same.
+    clean_length = len(clean_speech)
+    processed_length = len(processed_speech)
+    if clean_length != processed_length:
+        raise ValueError('Both Speech Files must be same length.')
+
+    overall_snr = 10 * np.log10(np.sum(np.square(clean_speech)) / np.sum(np.square(clean_speech - processed_speech)))
+
+    # Global Variables
+    winlength = round(30 * sample_rate / 1000)  # window length in samples
+    skiprate = int(np.floor(winlength / 4))  # window skip in samples
+    MIN_SNR = -10  # minimum SNR in dB
+    MAX_SNR = 35  # maximum SNR in dB
+
+    # For each frame of input speech, calculate the Segmental SNR
+    num_frames = int(clean_length / skiprate - (winlength / skiprate))  # number of frames
+    start = 0  # starting sample
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+    segmental_snr = np.empty(num_frames)
+    EPS = np.spacing(1)
+    for frame_count in range(num_frames):
+        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
+        clean_frame = clean_speech[start:start + winlength]
+        processed_frame = processed_speech[start:start + winlength]
+        clean_frame = np.multiply(clean_frame, window)
+        processed_frame = np.multiply(processed_frame, window)
+
+        # (2) Compute the Segmental SNR
+        signal_energy = np.sum(np.square(clean_frame))
+        noise_energy = np.sum(np.square(clean_frame - processed_frame))
+        segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + EPS) + EPS)
+        segmental_snr[frame_count] = max(segmental_snr[frame_count], MIN_SNR)
+        segmental_snr[frame_count] = min(segmental_snr[frame_count], MAX_SNR)
+
+        start = start + skiprate
+
+    return overall_snr, segmental_snr
+
+
+def lpcoeff(speech_frame, model_order):
+    # (1) Compute Autocorrelation Lags
+    winlength = np.size(speech_frame)
+    R = np.empty(model_order + 1)
+    E = np.empty(model_order + 1)
+    for k in range(model_order + 1):
+        R[k] = np.dot(speech_frame[0:winlength - k], speech_frame[k: winlength])
+
+    # (2) Levinson-Durbin
+    a = np.ones(model_order)
+    a_past = np.empty(model_order)
+    rcoeff = np.empty(model_order)
+    E[0] = R[0]
+    for i in range(model_order):
+        a_past[0: i] = a[0: i]
+        sum_term = np.dot(a_past[0: i], R[i:0:-1])
+        rcoeff[i] = (R[i + 1] - sum_term) / E[i]
+        a[i] = rcoeff[i]
+        if i == 0:
+            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1:-1:-1], rcoeff[i])
+        else:
+            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1::-1], rcoeff[i])
+        E[i + 1] = (1 - rcoeff[i] * rcoeff[i]) * E[i]
+    acorr = R
+    refcoeff = rcoeff
+    lpparams = np.concatenate((np.array([1]), -a))
+    return acorr, refcoeff, lpparams
+
+
+def llr(clean_speech, processed_speech, sample_rate):
+    from scipy.linalg import toeplitz
+
+    # Check the length of the clean and processed speech. Must be the same.
+    clean_length = np.size(clean_speech)
+    processed_length = np.size(processed_speech)
+    if clean_length != processed_length:
+        raise ValueError('Both Speech Files must be same length.')
+
+    # Global Variables
+    winlength = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+    skiprate = (np.floor(winlength / 4)).astype(int)  # window skip in samples
+    if sample_rate < 10000:
+        P = 10  # LPC Analysis Order
+    else:
+        P = 16  # this could vary depending on sampling frequency.
+
+    # For each frame of input speech, calculate the Log Likelihood Ratio
+    num_frames = int((clean_length - winlength) / skiprate)  # number of frames
+    start = 0  # starting sample
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+    distortion = np.empty(num_frames)
+    for frame_count in range(num_frames):
+        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
+        clean_frame = clean_speech[start: start + winlength]
+        processed_frame = processed_speech[start: start + winlength]
+        clean_frame = np.multiply(clean_frame, window)
+        processed_frame = np.multiply(processed_frame, window)
+
+        # (2) Get the autocorrelation lags and LPC parameters used to compute the LLR measure.
+        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
+        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
+
+        # (3) Compute the LLR measure
+        numerator = np.dot(np.matmul(A_processed, toeplitz(R_clean)), A_processed)
+        denominator = np.dot(np.matmul(A_clean, toeplitz(R_clean)), A_clean)
+        distortion[frame_count] = np.log(numerator / denominator)
+        start = start + skiprate
+    return distortion
+
+
+def wss(clean_speech, processed_speech, sample_rate):
+    from scipy.fftpack import fft
+
+    # Check the length of the clean and processed speech, which must be the same.
+    clean_length = np.size(clean_speech)
+    processed_length = np.size(processed_speech)
+    if clean_length != processed_length:
+        raise ValueError('Files must have same length.')
+
+    # Global variables
+    winlength = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+    skiprate = (np.floor(np.divide(winlength, 4))).astype(int)  # window skip in samples
+    max_freq = (np.divide(sample_rate, 2)).astype(int)  # maximum bandwidth
+    num_crit = 25  # number of critical bands
+
+    USE_FFT_SPECTRUM = 1  # defaults to 10th order LP spectrum
+    n_fft = (np.power(2, np.ceil(np.log2(2 * winlength)))).astype(int)
+    n_fftby2 = (np.multiply(0.5, n_fft)).astype(int)  # FFT size/2
+    Kmax = 20.0  # value suggested by Klatt, pg 1280
+    Klocmax = 1.0  # value suggested by Klatt, pg 1280
+
+    # Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
+    cent_freq = np.array([50.0000, 120.000, 190.000, 260.000, 330.000, 400.000, 470.000,
+                          540.000, 617.372, 703.378, 798.717, 904.128, 1020.38, 1148.30,
+                          1288.72, 1442.54, 1610.70, 1794.16, 1993.93, 2211.08, 2446.71,
+                          2701.97, 2978.04, 3276.17, 3597.63])
+    bandwidth = np.array([70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000,
+                          77.3724, 86.0056, 95.3398, 105.411, 116.256, 127.914, 140.423,
+                          153.823, 168.154, 183.457, 199.776, 217.153, 235.631, 255.255,
+                          276.072, 298.126, 321.465, 346.136])
+
+    bw_min = bandwidth[0]  # minimum critical bandwidth
+
+    # Set up the critical band filters.
+    # Note here that Gaussianly shaped filters are used.
+    # Also, the sum of the filter weights are equivalent for each critical band filter.
+    # Filter less than -30 dB and set to zero.
+    min_factor = np.exp(-30.0 / (2.0 * 2.303))  # -30 dB point of filter
+    crit_filter = np.empty((num_crit, n_fftby2))
+    for i in range(num_crit):
+        f0 = (cent_freq[i] / max_freq) * n_fftby2
+        bw = (bandwidth[i] / max_freq) * n_fftby2
+        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
+        j = np.arange(n_fftby2)
+        crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
+        cond = np.greater(crit_filter[i, :], min_factor)
+        crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
+    # For each frame of input speech, calculate the Weighted Spectral Slope Measure
+    num_frames = int(clean_length / skiprate - (winlength / skiprate))  # number of frames
+    start = 0  # starting sample
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+    distortion = np.empty(num_frames)
+    for frame_count in range(num_frames):
+        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
+        clean_frame = clean_speech[start: start + winlength] / 32768
+        processed_frame = processed_speech[start: start + winlength] / 32768
+        clean_frame = np.multiply(clean_frame, window)
+        processed_frame = np.multiply(processed_frame, window)
+        # (2) Compute the Power Spectrum of Clean and Processed
+        # if USE_FFT_SPECTRUM:
+        clean_spec = np.square(np.abs(fft(clean_frame, n_fft)))
+        processed_spec = np.square(np.abs(fft(processed_frame, n_fft)))
+
+        # (3) Compute Filterbank Output Energies (in dB scale)
+        clean_energy = np.matmul(crit_filter, clean_spec[0:n_fftby2])
+        processed_energy = np.matmul(crit_filter, processed_spec[0:n_fftby2])
+
+        clean_energy = 10 * np.log10(np.maximum(clean_energy, 1E-10))
+        processed_energy = 10 * np.log10(np.maximum(processed_energy, 1E-10))
+
+        # (4) Compute Spectral Slope (dB[i+1]-dB[i])
+        clean_slope = clean_energy[1:num_crit] - clean_energy[0: num_crit - 1]
+        processed_slope = processed_energy[1:num_crit] - processed_energy[0: num_crit - 1]
+
+        # (5) Find the nearest peak locations in the spectra to each critical band.
+        # If the slope is negative, we search to the left. If positive, we search to the right.
+        clean_loc_peak = np.empty(num_crit - 1)
+        processed_loc_peak = np.empty(num_crit - 1)
+
+        for i in range(num_crit - 1):
+            # find the peaks in the clean speech signal
+            if clean_slope[i] > 0:  # search to the right
+                n = i
+                while (n < num_crit - 1) and (clean_slope[n] > 0):
+                    n = n + 1
+                clean_loc_peak[i] = clean_energy[n - 1]
+            else:  # search to the left
+                n = i
+                while (n >= 0) and (clean_slope[n] <= 0):
+                    n = n - 1
+                clean_loc_peak[i] = clean_energy[n + 1]
+
+            # find the peaks in the processed speech signal
+            if processed_slope[i] > 0:  # search to the right
+                n = i
+                while (n < num_crit - 1) and (processed_slope[n] > 0):
+                    n = n + 1
+                processed_loc_peak[i] = processed_energy[n - 1]
+            else:  # search to the left
+                n = i
+                while (n >= 0) and (processed_slope[n] <= 0):
+                    n = n - 1
+                processed_loc_peak[i] = processed_energy[n + 1]
+
+        # (6) Compute the WSS Measure for this frame. This includes determination of the weighting function.
+        dBMax_clean = np.max(clean_energy)
+        dBMax_processed = np.max(processed_energy)
+        '''
+        The weights are calculated by averaging individual weighting factors from the clean and processed frame.
+        These weights W_clean and W_processed should range from 0 to 1 and place more emphasis on spectral peaks
+        and less emphasis on slope differences in spectral valleys.
+        This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
+        '''
+        Wmax_clean = np.divide(Kmax, Kmax + dBMax_clean - clean_energy[0: num_crit - 1])
+        Wlocmax_clean = np.divide(Klocmax, Klocmax + clean_loc_peak - clean_energy[0: num_crit - 1])
+        W_clean = np.multiply(Wmax_clean, Wlocmax_clean)
+
+        Wmax_processed = np.divide(Kmax, Kmax + dBMax_processed - processed_energy[0: num_crit - 1])
+        Wlocmax_processed = np.divide(Klocmax, Klocmax + processed_loc_peak - processed_energy[0: num_crit - 1])
+        W_processed = np.multiply(Wmax_processed, Wlocmax_processed)

+        W = np.divide(np.add(W_clean, W_processed), 2.0)
+        slope_diff = np.subtract(clean_slope, processed_slope)[0: num_crit - 1]
+        distortion[frame_count] = np.dot(W, np.square(slope_diff)) / np.sum(W)
+        # this normalization is not part of Klatt's paper, but helps to normalize the measure.
+        # Here we scale the measure by the sum of the weights.
+        start = start + skiprate
+    return distortion
+
+
+def calc_speech_metrics(hypothesis: np.ndarray,
+                        reference: np.ndarray) -> tuple[float, int, int, int, float]:
+    """
+    Calculate speech metrics pesq_mos, CSIG, CBAK, COVL, segSNR. These are all related and thus included
+    in one function. Reference: matlab script "compute_metrics.m".
+
+    Usage:
+        pesq, csig, cbak, covl, ssnr = compute_metrics(hypothesis, reference, Fs, path)
+        reference: clean audio as array
+        hypothesis: enhanced audio as array
+        Audio must have sampling rate = 16000 Hz.
+
+    Example call:
+        pesq_output, csig_output, cbak_output, covl_output, ssnr_output = \
+            calc_speech_metrics(predicted_audio, target_audio)
+    """
+    from sonusai.metrics import calc_pesq
+
+    Fs = 16000
+
+    # compute the WSS measure
+    wss_dist_vec = wss(reference, hypothesis, Fs)
+    wss_dist_vec = np.sort(wss_dist_vec)
+    alpha = 0.95  # value from CMGAN ref implementation
+    wss_dist = np.mean(wss_dist_vec[0: round(np.size(wss_dist_vec) * alpha)])
+
+    # compute the LLR measure
+    llr_dist = llr(reference, hypothesis, Fs)
+    ll_rs = np.sort(llr_dist)
+    llr_len = round(np.size(llr_dist) * alpha)
+    llr_mean = np.mean(ll_rs[0: llr_len])
+
+    # compute the SNRseg
+    snr_dist, segsnr_dist = snr(reference, hypothesis, Fs)
+    snr_mean = snr_dist
+    segSNR = np.mean(segsnr_dist)
+
+    # compute the pesq (use Sonusai wrapper, only fs=16k, mode=wb support)
+    pesq_mos = calc_pesq(hypothesis=hypothesis, reference=reference)
+    # pesq_mos = pesq(sampling_rate1, data1, data2, 'wb')
+
+    # now compute the composite measures
+    CSIG = 3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist
+    CSIG = max(1, CSIG)
+    CSIG = min(5, CSIG)  # limit values to [1, 5]
+    CBAK = 1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * segSNR
+    CBAK = max(1, CBAK)
+    CBAK = min(5, CBAK)  # limit values to [1, 5]
+    COVL = 1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist
+    COVL = max(1, COVL)
+    COVL = min(5, COVL)  # limit values to [1, 5]
+
+    return pesq_mos, CSIG, CBAK, COVL, segSNR
+
+
 def mean_square_error(hypothesis: np.ndarray,
                       reference: np.ndarray,
                       squared: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
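The CSIG/CBAK/COVL regressions at the end of this block are the Hu and Loizou composite measures as used in the CMGAN reference implementation. A minimal usage sketch, assuming two equal-length 16 kHz numpy arrays (the synthetic signals below are placeholders for real speech):

import numpy as np
from sonusai.calc_metric_spenh import calc_speech_metrics

rng = np.random.default_rng(0)
reference = rng.standard_normal(16000).astype(np.float32)    # stand-in for clean speech
hypothesis = reference + 0.01 * rng.standard_normal(16000).astype(np.float32)
pesq_mos, csig, cbak, covl, segsnr = calc_speech_metrics(hypothesis, reference)
print(f'PESQ={pesq_mos:.2f} CSIG={csig:.2f} CBAK={cbak:.2f} COVL={covl:.2f} SegSNR={segsnr:.1f} dB')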
@@ -191,31 +490,43 @@ def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray
     return err, err_b, err_f


-def phase_distance(reference: np.ndarray,
-
+def phase_distance(reference: np.ndarray,
+                   hypothesis: np.ndarray,
+                   eps: float = 1e-9) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Calculate weighted phase distance error (weight normalization over bins per frame)

     :param reference: complex [frames, bins]
     :param hypothesis: complex [frames, bins]
+    :param eps: epsilon value
     :return: mean, mean per bin, mean per frame
     """
-
-
-    rh_angle_diff =
+    ang_diff = np.angle(reference) - np.angle(hypothesis)
+    phd_mod = (ang_diff + np.pi) % (2 * np.pi) - np.pi
+    rh_angle_diff = phd_mod * 180 / np.pi  # angle diff in deg
+
+    # Use complex divide to intrinsically keep angle diff +/-180 deg, but avoid div by zero (real hyp)
+    # hyp_real = np.real(hypothesis)
+    # near_zeros = np.real(hyp_real) < eps
+    # hyp_real = hyp_real * (np.logical_not(near_zeros))
+    # hyp_real = hyp_real + (near_zeros * eps)
+    # hypothesis = hyp_real + 1j*np.imag(hypothesis)
+    # rh_angle_diff = np.angle(reference / hypothesis) * 180 / np.pi  # angle diff +/-180

     # weighted mean over all (scalar)
-
+    reference_mag = np.abs(reference)
+    ref_weight = reference_mag / (np.sum(reference_mag) + eps)  # frames x bins
     err = np.around(np.sum(ref_weight * rh_angle_diff), 3)

     # weighted mean over frames (value per bin)
     err_b = np.zeros(reference.shape[1])
     for bi in range(reference.shape[1]):
-        ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) +
+        ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) + eps)
         err_b[bi] = np.around(np.sum(ref_weight * rh_angle_diff[:, bi]), 3)

     # weighted mean over bins (value per frame)
     err_f = np.zeros(reference.shape[0])
     for fi in range(reference.shape[0]):
-        ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) +
+        ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) + eps)
         err_f[fi] = np.around(np.sum(ref_weight * rh_angle_diff[fi, :]), 3)

     return err, err_b, err_f
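The modulo expression in the rewritten phase_distance is the standard trick for wrapping an angle difference into (-180, 180] degrees; a quick standalone check of that identity:

import numpy as np

# 170 deg minus -170 deg is a raw difference of 340 deg ...
ang_diff = np.radians(170.0) - np.radians(-170.0)
# ... which the modulo form wraps to the equivalent -20 deg
wrapped = (ang_diff + np.pi) % (2 * np.pi) - np.pi
print(np.degrees(wrapped))  # -20.0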
@@ -228,6 +539,7 @@ def plot_mixpred(mixture: AudioT,
                  predict: Optional[Predict] = None,
                  tp_title: str = '') -> plt.Figure:
     from sonusai.mixture import SAMPLE_RATE
+
     num_plots = 2
     if feature is not None:
         num_plots += 1
@@ -268,8 +580,8 @@ def plot_mixpred(mixture: AudioT,


 def plot_pdb_predtruth(predict: np.ndarray,
-                       truth_f: np.ndarray
-                       metric: np.ndarray
+                       truth_f: Optional[np.ndarray] = None,
+                       metric: Optional[np.ndarray] = None,
                        tp_title: str = '') -> plt.Figure:
     """Plot predict and optionally truth and a metric in power db, e.g. applies 10*log10(predict)"""
     num_plots = 2
@@ -320,9 +632,9 @@ def plot_pdb_predtruth(predict: np.ndarray,

 def plot_epredtruth(predict: np.ndarray,
                     predict_wav: np.ndarray,
-                    truth_f: np.ndarray
-                    truth_wav: np.ndarray
-                    metric: np.ndarray
+                    truth_f: Optional[np.ndarray] = None,
+                    truth_wav: Optional[np.ndarray] = None,
+                    metric: Optional[np.ndarray] = None,
                     tp_title: str = '') -> plt.Figure:
     """Plot predict spectrogram and waveform and optionally truth and a metric)"""
     num_plots = 2
@@ -390,76 +702,91 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     from os.path import splitext

     import h5py
-    from
+    from numpy import inf
+    from pystoi import stoi

     from sonusai import SonusAIError
+    from sonusai import logger
     from sonusai.metrics import calc_pcm
-    from sonusai.metrics import calc_pesq
-    from sonusai.metrics import calc_sa_sdr
     from sonusai.metrics import calc_wer
     from sonusai.metrics import calc_wsdr
+    from sonusai.mixture import forward_transform
     from sonusai.mixture import inverse_transform
+    from sonusai.mixture import read_audio
     from sonusai.utils import calc_asr
     from sonusai.utils import float_to_int16
     from sonusai.utils import reshape_outputs
+    from sonusai.utils import stack_complex
     from sonusai.utils import unstack_complex
     from sonusai.utils import write_wav

+    mixdb = MP_GLOBAL.mixdb
+    predict_location = MP_GLOBAL.predict_location
+    predwav_mode = MP_GLOBAL.predwav_mode
+    truth_est_mode = MP_GLOBAL.truth_est_mode
+    enable_plot = MP_GLOBAL.enable_plot
+    enable_wav = MP_GLOBAL.enable_wav
+    wer_method = MP_GLOBAL.wer_method
+    whisper_model = MP_GLOBAL.whisper_model
+
     # 1) Read predict data, var predict with shape [BatchSize,Classes] or [BatchSize,Tsteps,Classes]
-    output_name = join(
+    output_name = join(predict_location, mixdb.mixture(mixid).name)
     predict = None
-    if
-
-
-
-            predict = np.array(f['predict'])
-        except Exception as e:
-            raise SonusAIError(f'Error reading {output_name}: {e}')
-        # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
-        if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
-            # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
-            predict, _ = reshape_outputs(predict=predict, timesteps=predict.shape[1])
-    else:
-        # in truth estimation mode we use the truth instead of prediction to see metrics with perfect input
-        # so don't bother to read prediction and mark outputs with tru suffix, i.e. 0000_truest_*
+    if truth_est_mode:
+        # in truth estimation mode we use the truth in place of prediction to see metrics with perfect input
+        # don't bother to read prediction, and predict var will get assigned to truth later
+        # mark outputs with tru suffix, i.e. 0000_truest_*
         base_name = splitext(output_name)[0] + '_truest'
+    else:
+        base_name, ext = splitext(output_name)  # base_name used later
+        if not predwav_mode:
+            try:
+                with h5py.File(output_name, 'r') as f:
+                    predict = np.array(f['predict'])
+            except Exception as e:
+                raise SonusAIError(f'Error reading {output_name}: {e}')
+            # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
+            if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
+                # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
+                predict, _ = reshape_outputs(predict=predict, truth=None, timesteps=predict.shape[1])
+        else:
+            base_name, ext = splitext(output_name)
+            prfname = join(base_name + '.wav')
+            audio = read_audio(prfname)
+            predict = forward_transform(audio, mixdb.ft_config)
+            if mixdb.feature[0:1] == 'h':
+                predict = power_compress(predict)
+            predict = stack_complex(predict)

     # 2) Collect true target, noise, mixture data, trim to predict size if needed
-    target =
-    target_f =
-    noise =
-    noise_f =
-    mixture =
-    mixture_f =
-    segsnr_f =
-    segsnr_f[segsnr_f ==
-    segsnr_f[segsnr_f == -
+    target = mixdb.mixture_target(mixid)
+    target_f = mixdb.mixture_target_f(mixid, target=target)
+    noise = mixdb.mixture_noise(mixid)
+    noise_f = mixdb.mixture_noise_f(mixid, noise=noise)
+    mixture = mixdb.mixture_mixture(mixid, target=target, noise=noise)
+    mixture_f = mixdb.mixture_mixture_f(mixid, mixture=mixture)
+    segsnr_f = mixdb.mixture_segsnr(mixid, target=target, noise=noise)
+    segsnr_f[segsnr_f == inf] = 7.944e8  # 99db
+    segsnr_f[segsnr_f == -inf] = 1.258e-10  # -99db
     # need to use inv-tf to match #samples & latency shift properties of predict inv tf
-    targetfi = inverse_transform(target_f,
-    noisefi = inverse_transform(noise_f,
-    # mixturefi = inverse_transform(mixture_f
+    targetfi = inverse_transform(target_f, mixdb.it_config)
+    noisefi = inverse_transform(noise_f, mixdb.it_config)
+    # mixturefi = mixdb.inverse_transform(mixture_f)

     # gen feature, truth - note feature only used for plots
     # TBD parse truth_f for different formats and also multi-truth
-    feature, truth_f =
-    truth_type =
-    0].function
+    feature, truth_f = mixdb.mixture_ft(mixid, mixture=mixture)
+    truth_type = mixdb.target(mixdb.mixture(mixid).target_id[0]).truth_settings[0].function
     if truth_type == 'target_mixture_f':
         half = truth_f.shape[-1] // 2
         truth_f = truth_f[..., :half]  # extract target_f only

-    if
-        raise SonusAIError(f'Error: mixture {mixid} does not have the same number of frames as truth, '
-                           f'{target_f.shape[0]} != {truth_f.shape[0]}')
-
-    if not MP_GLOBAL.truth_est_mode:
+    if not truth_est_mode:
         if predict.shape[0] < target_f.shape[0]:  # target_f, truth_f, mixture_f, etc. same size
             trimf = target_f.shape[0] - predict.shape[0]
-            logger.debug(f'Warning: prediction {
-                         f'{predict.shape[0]} < {target_f.shape[0]}'
-                         f'trimming {trimf} frames from all truth.')
+            logger.debug(f'Warning: prediction frames less than mixture, trimming {trimf} frames from all truth.')
             target_f = target_f[0:-trimf, :]
-            targetfi = inverse_transform(target_f,
+            targetfi, _ = inverse_transform(target_f, mixdb.it_config)
             trimt = target.shape[0] - targetfi.shape[0]
             target = target[0:-trimt]
             noise_f = noise_f[0:-trimf, :]
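For reference, a minimal sketch of the .h5 predict-file layout the reader above expects: one 'predict' dataset per mixture, shaped [frames, classes] or [batch, timesteps, classes] (the file name here is a hypothetical example):

import h5py
import numpy as np

with h5py.File('0000.h5', 'r') as f:  # hypothetical predict file name
    predict = np.array(f['predict'])
if predict.ndim > 2:
    # collapse batch and timestep dimensions to [frames, classes]
    predict = predict.reshape(-1, predict.shape[-1])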
@@ -468,30 +795,29 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             mixture = mixture[0:-trimt]
             truth_f = truth_f[0:-trimf, :]
         elif predict.shape[0] > target_f.shape[0]:
-            raise SonusAIError(
-
+            raise SonusAIError(
+                f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')

     # 3) Extraction - format proper complex and wav estimates and truth (unstack, uncompress, inv tf, etc.)
-    if
+    if truth_est_mode:
         predict = truth_f  # substitute truth for the prediction (for test/debug)
         predict_complex = unstack_complex(predict)  # unstack
         # if feat has compressed mag and truth does not, compress it
-        if
-        0:10] != 'targetcmpr':
+        if mixdb.feature[0:1] == 'h' and mixdb.target(1).truth_settings[0].function[0:10] != 'targetcmpr':
            predict_complex = power_compress(predict_complex)  # from uncompressed truth
     else:
         predict_complex = unstack_complex(predict)

     truth_f_complex = unstack_complex(truth_f)
-    if
+    if mixdb.feature[0:1] == 'h':  # 'hn' or 'ha' or 'hd', etc.: # if feat has compressed mag
         # estimate noise in uncompressed-mag domain
         noise_est_complex = mixture_f - power_uncompress(predict_complex)
         predict_complex = power_uncompress(predict_complex)  # uncompress if truth is compressed
     else:  # cn, c8, ..
         noise_est_complex = mixture_f - predict_complex

-    target_est_wav = inverse_transform(predict_complex,
-    noise_est_wav = inverse_transform(noise_est_complex,
+    target_est_wav = inverse_transform(predict_complex, mixdb.it_config)
+    noise_est_wav = inverse_transform(noise_est_complex, mixdb.it_config)

     # 4) Metrics
     # Target/Speech logerr - PSD estimation accuracy symmetric mean log-spectral distortion
@@ -509,19 +835,25 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # Noise td logerr
     # lerr_nt, lerr_nt_bin, lerr_nt_frame = log_error(noisefi, noise_truth_est_audio)

-    # SA-SDR (time-domain source-
+    # # SA-SDR (time-domain source-aggragated SDR)
     ytrue = np.concatenate((targetfi[:, np.newaxis], noisefi[:, np.newaxis]), axis=1)
     ypred = np.concatenate((target_est_wav[:, np.newaxis], noise_est_wav[:, np.newaxis]), axis=1)
-    # note: w/o scale is more pessimistic number
-    sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
+    # # note: w/o scale is more pessimistic number
+    # sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
+    target_stoi = stoi(targetfi, target_est_wav, 16000, extended=False)

     wsdr, wsdr_cc, wsdr_cw = calc_wsdr(hypothesis=ypred, reference=ytrue, with_log=True)
-    logger.debug(f'
+    # logger.debug(f'wsdr weight sum for mixid {mixid} = {np.sum(wsdr_cw)}.')
+    # logger.debug(f'wsdr cweights = {wsdr_cw}.')
+    # logger.debug(f'wsdr ccoefs for mixid {mixid} = {wsdr_cc}.')

     # Speech intelligibility measure - PESQ
-    if int(
-
-
+    if int(mixdb.mixture(mixid).snr) > -99:
+        # len = target_est_wav.shape[0]
+        pesq_speech, csig_tg, cbak_tg, covl_tg, sgsnr_tg = calc_speech_metrics(target_est_wav, targetfi)
+        pesq_mixture, csig_mx, cbak_mx, covl_mx, sgsnr_mx = calc_speech_metrics(mixture, target)
+        # pesq_speech_tst = calc_pesq(hypothesis=target_est_wav, reference=target)
+        # pesq_mixture_tst = calc_pesq(hypothesis=mixture, reference=target)
         # pesq improvement
         pesq_impr = pesq_speech - pesq_mixture
         # pesq improvement %
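The new target_stoi value comes from pystoi, whose stoi(reference, degraded, fs_sig, extended=False) call returns a scalar intelligibility score (roughly 0 to 1). A minimal sketch with placeholder signals standing in for real speech:

import numpy as np
from pystoi import stoi

fs = 16000
rng = np.random.default_rng(0)
clean = rng.standard_normal(3 * fs).astype(np.float32)    # placeholder for clean speech
degraded = clean + 0.3 * rng.standard_normal(3 * fs).astype(np.float32)
print(stoi(clean, degraded, fs, extended=False))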
@@ -530,12 +862,18 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         pesq_speech = 0
         pesq_mixture = 0
         pesq_impr_pc = np.float32(0)
+        csig_mx = 0
+        csig_tg = 0
+        cbak_mx = 0
+        cbak_tg = 0
+        covl_mx = 0
+        covl_tg = 0

     # Calc WER
-
-
-
-
+    asr_tt = ''
+    asr_mx = ''
+    asr_tge = ''
+    if wer_method == 'none' or mixdb.mixture(mixid).snr == -99:  # noise only, ignore/reset target asr
         wer_mx = float('nan')
         wer_tge = float('nan')
         wer_pi = float('nan')
@@ -543,13 +881,11 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         if MP_GLOBAL.mixdb.asr_manifests:
             asr_tt = MP_GLOBAL.mixdb.mixture_asr_data(mixid)[0]  # ignore mixup
         else:
-            asr_tt = calc_asr(target, engine=
-                              whisper_model_name=MP_GLOBAL.whisper_model).text  # target truth
+            asr_tt = calc_asr(target, engine=wer_method, whisper_model_name=whisper_model).text  # target truth

         if asr_tt:
-            asr_mx = calc_asr(mixture, engine=
-            asr_tge = calc_asr(target_est_wav, engine=
-                               whisper_model_name=MP_GLOBAL.whisper_model).text
+            asr_mx = calc_asr(mixture, engine=wer_method, whisper_model=whisper_model).text
+            asr_tge = calc_asr(target_est_wav, engine=wer_method, whisper_model=whisper_model).text

             wer_mx = calc_wer(asr_mx, asr_tt).wer * 100  # mixture wer
             wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate wer
@@ -561,24 +897,21 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             else:
                 wer_pi = 100 * (wer_mx - wer_tge) / wer_mx
         else:
-
-            print(f'Warning: mixid {mixid} asr truth is empty, setting to 0% wer')
-            asr_mx = ''
-            asr_tge = ''
+            print(f'Warning: mixid {mixid} asr truth is empty, setting to 0% wer')
             wer_mx = float(0)
             wer_tge = float(0)
             wer_pi = float(0)

     # 5) Save per mixture metric results
     # Single row in table of scalar metrics per mixture
-    mtable1_col = ['MXSNR', 'MXPESQ', 'PESQ', 'PESQi%', 'MXWER', 'WER', 'WERi%', 'WSDR', '
-                   'PCM', 'SPLERR', 'NLERR', 'PD', '
-
-
-
-
-             basename(
+    mtable1_col = ['MXSNR', 'MXPESQ', 'PESQ', 'PESQi%', 'MXWER', 'WER', 'WERi%', 'WSDR', 'STOI',
+                   'PCM', 'SPLERR', 'NLERR', 'PD', 'MXCSIG', 'CSIG', 'MXCBAK', 'CBAK', 'MXCOVL', 'COVL',
+                   'SPFILE', 'NFILE']
+    ti = mixdb.mixture(mixid).target_id[0]
+    ni = mixdb.mixture(mixid).noise_id
+    metr1 = [mixdb.mixture(mixid).snr, pesq_mixture, pesq_speech, pesq_impr_pc, wer_mx, wer_tge, wer_pi, wsdr,
+             target_stoi, pcm, lerr_tg, lerr_n, phd, csig_mx, csig_tg, cbak_mx, cbak_tg, covl_mx, covl_tg,
+             basename(mixdb.target(ti).name), basename(mixdb.noise(ni).name)]
     mtab1 = pd.DataFrame([metr1], columns=mtable1_col, index=[mixid])

     # Stats of per frame estimation metrics
@@ -588,7 +921,8 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
                           'NLERR': lerr_n_frame,
                           'SPD': phd_frame})
     metr2 = metr2.describe()  # Use pandas stat function
-    metr2['SSNR'][1:] = metr2['SSNR'][1:].apply(
+    metr2['SSNR'][1:] = metr2['SSNR'][1:].apply(
+        lambda x: 10 * np.log10(x + 1.01e-10))  # Change SSNR stats to dB, except count
     # create a single row in multi-column header
     new_labels = pd.MultiIndex.from_product([metr2.columns,
                                              ['Avg', 'Min', 'Med', 'Max', 'Std']],
@@ -597,7 +931,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     mtab2 = pd.DataFrame(dat1row,
                          index=[mixid],
                          columns=new_labels)
-    mtab2.insert(0, 'MXSNR',
+    mtab2.insert(0, 'MXSNR', mixdb.mixture(mixid).snr, False)  # add MXSNR as the first metric column

     all_metrics_table_1 = mtab1  # return to be collected by process
     all_metrics_table_2 = mtab2  # return to be collected by process
@@ -610,41 +944,44 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         print(f'Extraction statistics over {mixture_f.shape[0]} frames:', file=f)
         print(metr2.round(2).to_string(float_format=lambda x: "{:.2f}".format(x)), file=f)
         print('', file=f)
-        print(f'Target path: {
-        print(f'Noise path: {
-        if
-            print(f'WER method: {
-        if
+        print(f'Target path: {mixdb.target(ti).name}', file=f)
+        print(f'Noise path: {mixdb.noise(ni).name}', file=f)
+        if wer_method != 'none':
+            print(f'WER method: {wer_method} and whisper model (if used): {whisper_model}', file=f)
+        if mixdb.asr_manifests:
             print(f'ASR truth from manifest: {asr_tt}', file=f)
         else:
             print(f'ASR truth from wer method: {asr_tt}', file=f)
         print(f'ASR result for mixture: {asr_mx}', file=f)
         print(f'ASR result for prediction: {asr_tge}', file=f)
-
+
+        print(f'Augmentations: {mixdb.mixture(mixid)}', file=f)

     # 7) write wav files
-    if
+    if enable_wav:
         write_wav(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
         write_wav(name=base_name + '_target.wav', audio=float_to_int16(target))
+        # write_wav(name=base_name + '_targetfi.wav', audio=float_to_int16(targetfi))
         write_wav(name=base_name + '_noise.wav', audio=float_to_int16(noise))
         write_wav(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
         write_wav(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))

     # debug code to test for perfect reconstruction of the extraction method
     # note both 75% olsa-hanns and 50% olsa-hann modes checked to have perfect reconstruction
-    # target_r = inverse_transform(target_f
-    # noise_r = inverse_transform(noise_f
+    # target_r = mixdb.inverse_transform(target_f)
+    # noise_r = mixdb.inverse_transform(noise_f)
     # _write_wav(name=base_name + '_target_r.wav', audio=float_to_int16(target_r))
     # _write_wav(name=base_name + '_noise_r.wav', audio=float_to_int16(noise_r))  # chk perfect rec

     # 8) Write out plot file
-    if
+    if enable_plot:
+        from matplotlib.backends.backend_pdf import PdfPages
         plot_fname = base_name + '_metric_spenh.pdf'

         # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
         # Original size (frames, stride, num_bands), decimates in stride dimension only if step is > 1
         # Reshape to get frames*decimated_stride, num_bands
-        step = int(
+        step = int(mixdb.feature_samples / mixdb.feature_step_samples)
         if feature.ndim != 3:
             raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, num_bands')

@@ -656,18 +993,17 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:

         with PdfPages(plot_fname) as pdf:
             # page1 we always have a mixture and prediction, target optional if truth provided
-            tfunc_name =
-
-
-
-
-
-
-
-
-
-
-                tfunc_name = tfunc_name + ' (db)'
+            tfunc_name = mixdb.target(1).truth_settings[0].function  # first target, assumes all have same
+            if tfunc_name == 'mapped_snr_f':
+                # leave as unmapped snr
+                predplot = predict
+                tfunc_name = mixdb.target(1).truth_settings[0].function
+            elif tfunc_name == 'target_f' or 'target_mixture_f':
+                predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
+            else:
+                # use dB scale
+                predplot = 10 * np.log10(predict + np.finfo(np.float32).eps)
+                tfunc_name = tfunc_name + ' (db)'

             mixspec = 20 * np.log10(abs(mixture_f) + np.finfo(np.float32).eps)
             pdf.savefig(plot_mixpred(mixture=mixture,
@@ -710,7 +1046,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     return all_metrics_table_1, all_metrics_table_2


-def main()
+def main():
     from docopt import docopt

     import sonusai
@@ -729,17 +1065,20 @@ def main() -> None:
     truth_location = args['TLOC']
     whisper_model = args['--whisper-model'].lower()

-
+    import glob
     from os.path import basename
     from os.path import isdir
     from os.path import join
+    from os.path import split

     from tqdm import tqdm

     from sonusai import create_file_handler
     from sonusai import initial_log_messages
+    from sonusai import logger
     from sonusai import update_console_handler
-    from sonusai.mixture import
+    from sonusai.mixture import DEFAULT_NOISE
+    from sonusai.mixture import MixtureDatabase
     from sonusai.mixture import read_audio
     from sonusai.utils import calc_asr
     from sonusai.utils import pp_tqdm_imap
@@ -749,12 +1088,19 @@ def main() -> None:
         print(f'The specified predict location {predict_location} is not a valid subdirectory path, exiting ...')

     # allpfiles = listdir(predict_location)
-
-    predict_logfile = glob(predict_location + "/*predict.log")
-
-
+    allpfiles = glob.glob(predict_location + "/*.h5")
+    predict_logfile = glob.glob(predict_location + "/*predict.log")
+    predwav_mode = False
+    if len(allpfiles) <= 0 and not truth_est_mode:
+        allpfiles = glob.glob(predict_location + "/*.wav")  # check for wav files
+        if len(allpfiles) <= 0:
+            print(f'Subdirectory {predict_location} has no .h5 or .wav files, exiting ...')
+        else:
+            logger.info(f'Found {len(allpfiles)} prediction .wav files.')
+            predwav_mode = True
     else:
-        logger.info(f'Found {len(
+        logger.info(f'Found {len(allpfiles)} prediction .h5 files.')
+
     if len(predict_logfile) == 0:
         logger.info(f'Warning, predict location {predict_location} has no prediction log files.')
     else:
@@ -767,52 +1113,61 @@ def main() -> None:

     mixdb = MixtureDatabase(truth_location)
     mixids = mixdb.mixids_to_list(mixids)
-    logger.info(
-
+    logger.info(
+        f'Found mixdb of {mixdb.num_mixtures} total mixtures, with {mixdb.num_classes} classes in {truth_location}')
     logger.info(f'Only running specified subset of {len(mixids)} mixtures')

     enable_asr_warmup = False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if wer_method == 'none':
+        fnb = 'metric_spenh_'
+    elif wer_method == 'google':
+        fnb = 'metric_spenh_ggl_'
+        logger.info(f'WER enabled with method {wer_method}')
+        enable_asr_warmup = True
+    elif wer_method == 'deepgram':
+        fnb = 'metric_spenh_dgram_'
+        logger.info(f'WER enabled with method {wer_method}')
+        enable_asr_warmup = True
+    elif wer_method == 'aixplain_whisper':
+        fnb = 'metric_spenh_whspx_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    elif wer_method == 'whisper':
+        fnb = 'metric_spenh_whspl_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    elif wer_method == 'aaware_whisper':
+        fnb = 'metric_spenh_whspaaw_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    elif wer_method == 'fastwhisper':
+        fnb = 'metric_spenh_fwhsp_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    else:
+        logger.error(f'Unrecognized WER method: {wer_method}')
+        return

     if enable_asr_warmup:
+        DEFAULT_SPEECH = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
         audio = read_audio(DEFAULT_SPEECH)
-        logger.info(f'Warming up
+        logger.info(f'Warming up asr method, note for cloud service this could take up to a few min ...')
        asr_chk = calc_asr(audio, engine=wer_method, whisper_model_name=whisper_model)
         logger.info(f'Warmup completed, results {asr_chk}')

-    # Individual mixtures use pandas print, set precision to 2 decimal places
-    # pd.set_option('float_format', '{:.2f}'.format)
     MP_GLOBAL.mixdb = mixdb
     MP_GLOBAL.predict_location = predict_location
+    MP_GLOBAL.predwav_mode = predwav_mode
     MP_GLOBAL.truth_est_mode = truth_est_mode
     MP_GLOBAL.enable_plot = enable_plot
     MP_GLOBAL.enable_wav = enable_wav
     MP_GLOBAL.wer_method = wer_method
     MP_GLOBAL.whisper_model = whisper_model

-
-
+    # Individual mixtures use pandas print, set precision to 2 decimal places
+    # pd.set_option('float_format', '{:.2f}'.format)
+    progress = tqdm(total=len(mixids), desc='calc_metric_spenh')
+    all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, num_cpus=None)
     progress.close()

     all_metrics_table_1 = pd.concat([item[0] for item in all_metrics_tables])
@@ -842,29 +1197,32 @@ def main() -> None:
     mtab_snr_summary['PESQi%'] = 100 * (mtab_snr_summary['PESQ'] - mtab_snr_summary['MXPESQ']) / np.maximum(
         mtab_snr_summary['MXPESQ'], 0.01)
     for i in range(len(mtab_snr_summary)):
-
-
-        if tmp_mxwer == 0.0:
-            if tmp_wer == 0.0:
+        if mtab_snr_summary['MXWER'].iloc[i] == 0.0:
+            if mtab_snr_summary['WER'].iloc[i] == 0.0:
                 mtab_snr_summary['WERi%'].iloc[i] = 0.0
             else:
                 mtab_snr_summary['WERi%'].iloc[i] = -999.0
         else:
-            mtab_snr_summary['WERi%'].iloc[i] = 100 * (
+            mtab_snr_summary['WERi%'].iloc[i] = 100 * (mtab_snr_summary['MXWER'].iloc[i] -
+                                                       mtab_snr_summary['WER'].iloc[i]) / \
+                                                mtab_snr_summary['MXWER'].iloc[i]

     # Calculate avg metrics over all mixtures except -99
     all_mtab1_sorted_nom99 = all_mtab1_sorted[all_mtab1_sorted.MXSNR != -99]
     all_nom99_mean = all_mtab1_sorted_nom99.mean(numeric_only=True)

     # correct the percentage averages with a direct calculation (PESQ% and WER%):
-
-
-
-
+    # ser.iloc[pos]
+    all_nom99_mean['PESQi%'] = (100 * (all_nom99_mean['PESQ'] - all_nom99_mean['MXPESQ'])
+                                / np.maximum(all_nom99_mean['MXPESQ'], 0.01))  # pesq%
+    # all_nom99_mean[3] = 100 * (all_nom99_mean[2] - all_nom99_mean[1]) / np.maximum(all_nom99_mean[1], 0.01)  # pesq%
+    if all_nom99_mean['MXWER'] == 0.0:
+        if all_nom99_mean['WER'] == 0.0:
+            all_nom99_mean['WERi%'] = 0.0
         else:
-        all_nom99_mean[
-    else:
-        all_nom99_mean[
+            all_nom99_mean['WERi%'] = -999.0
+    else:  # wer%
+        all_nom99_mean['WERi%'] = 100 * (all_nom99_mean['MXWER'] - all_nom99_mean['WER']) / all_nom99_mean['MXWER']

     num_mix = len(mixids)
     if num_mix > 1:
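The WERi% branches above guard a division that is undefined when the mixture WER is zero. The same rule as a standalone sketch (the helper name is illustrative, not part of the package):

def wer_improvement_pct(mxwer: float, wer: float) -> float:
    # sentinel -999.0 mirrors the summary-table convention above
    if mxwer == 0.0:
        return 0.0 if wer == 0.0 else -999.0
    return 100 * (mxwer - wer) / mxwer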
sonusai/data_generator/torch_from_mixdb.py
CHANGED
@@ -32,7 +32,7 @@ class MixtureDatabaseDataset(Dataset):
     def __len__(self):
         return len(self.mixids)

-    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray]:
+    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray, int]:
         """Get data from one mixture
         """
         import random
@@ -68,7 +68,7 @@ class MixtureDatabaseDataset(Dataset):
         feature = feature[start:start + self.cut_len]
         truth = truth[start:start + self.cut_len]

-        return feature, truth
+        return feature, truth, idx


 class AawareDataLoader(DataLoader):
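With __getitem__ now returning the mixture index as a third element, a consumer can map shuffled batches back to mixture IDs. A minimal sketch, assuming a MixtureDatabaseDataset instance built elsewhere (construction elided):

from torch.utils.data import DataLoader

def iterate_with_ids(dataset):  # dataset: a MixtureDatabaseDataset built elsewhere
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    for feature, truth, idx in loader:
        print(idx)  # original mixture indices for this shuffled batch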
sonusai/mixture/audio.py
CHANGED
@@ -59,6 +59,7 @@ def get_duration(audio: AudioT) -> float:
     :return: Duration of audio in seconds
     """
     from .constants import SAMPLE_RATE
+
     return len(audio) / SAMPLE_RATE


@@ -66,14 +67,15 @@ def validate_input_file(input_filepath: str) -> None:
     from os.path import exists
     from os.path import splitext

-    from
+    from soundfile import available_formats

     from sonusai import SonusAIError
+
     if not exists(input_filepath):
         raise SonusAIError(f'input_filepath {input_filepath} does not exist.')

     ext = splitext(input_filepath)[1][1:].lower()
-    read_formats =
+    read_formats = [item.lower() for item in available_formats().keys()]
     if ext not in read_formats:
         raise SonusAIError(f'This installation of SoX cannot process .{ext} files')

@@ -86,6 +88,7 @@ def read_audio(name: Location) -> AudioT:
     :return: Array of time domain audio data
     """
     from .torchaudio_audio import read_torchaudio_audio
+
     return read_torchaudio_audio(name)


@@ -97,4 +100,5 @@ def read_ir(name: Location) -> ImpulseResponseData:
     :return: ImpulseResponseData object
     """
     from .torchaudio_audio import read_torchaudio_ir
+
     return read_torchaudio_ir(name)
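The rewritten validate_input_file derives its extension whitelist from libsndfile via the soundfile package: available_formats() maps major-format codes (e.g. 'WAV', 'FLAC') to descriptions, so lowercasing the keys yields comparable extensions. A quick sketch of that check:

from soundfile import available_formats

read_formats = [fmt.lower() for fmt in available_formats().keys()]
print('wav' in read_formats)  # True on a typical libsndfile build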
sonusai/mixture/torchaudio_audio.py
CHANGED
@@ -21,7 +21,7 @@ def read_torchaudio_ir(name: Location) -> ImpulseResponseData:

     # Read impulse response data from audio file
     try:
-        raw, sample_rate = torchaudio.load(expanded_name)
+        raw, sample_rate = torchaudio.load(expanded_name, backend='soundfile')
     except Exception as e:
         if name != expanded_name:
             raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
@@ -58,7 +58,7 @@ def read_torchaudio_audio(name: Location) -> AudioT:
     expanded_name, _ = tokenized_expand(name)

     try:
-        out, samplerate = torchaudio.load(expanded_name)
+        out, samplerate = torchaudio.load(expanded_name, backend='soundfile')
         out = torch.reshape(out[0, :], (1, out.size()[1]))

     if not samplerate == SAMPLE_RATE:
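Both loaders now pin the decode path explicitly: in torchaudio 2.x the backend keyword on load() selects the I/O backend per call instead of relying on the global default. A minimal sketch (the file name is a placeholder):

import torchaudio

# force libsndfile-based decoding regardless of the global default
waveform, sample_rate = torchaudio.load('example.wav', backend='soundfile')
print(waveform.shape, sample_rate)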
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonusai
-Version: 0.14.0
+Version: 0.14.2
 Summary: Framework for building deep neural network models for sound, speech, and voice AI
 Home-page: https://aaware.com
 License: GPL-3.0-only
@@ -30,9 +30,11 @@ Requires-Dist: paho-mqtt (>=1.6.1,<2.0.0)
 Requires-Dist: pandas (>=2.1.1,<3.0.0)
 Requires-Dist: pesq (>=0.0.4,<0.0.5)
 Requires-Dist: pyaaware (>=1.5.3,<2.0.0)
+Requires-Dist: pystoi (>=0.3.3,<0.4.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: scikit-learn (>=1.3.1,<2.0.0)
 Requires-Dist: sh (>=2.0.6,<3.0.0)
+Requires-Dist: soundfile (>=0.12.1,<0.13.0)
 Requires-Dist: sox (>=1.4.1,<2.0.0)
 Requires-Dist: speechrecognition (>=3.10.0,<4.0.0)
 Requires-Dist: sqlalchemy[mypy] (>=2.0.22,<3.0.0)
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 sonusai/__init__.py,sha256=KmIJ9wni9d9v5pyu0pUxbacZIHGkAywB9CJwl7JME28,1526
 sonusai/aawscd_probwrite.py,sha256=GukR5owp_0A3DrqSl9fHWULYgclNft4D5OkHIwfxxkc,3698
-sonusai/calc_metric_spenh.py,sha256=
+sonusai/calc_metric_spenh.py,sha256=cE5lexBq6nZHY7-zudqsMsoz5fFYqVAWgKk21dIlHSw,60810
 sonusai/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonusai/data/genmixdb.yml,sha256=6C1GUr_0P5_hEAYSn0MLAqoSzDk_rP8TyV0sVMZqz1Q,16233
 sonusai/data/speech_ma01_01.wav,sha256=PK0vMKg-NR6rPE3KouxHGF6PKXnJCr7AwjMqfu98LUA,76644
@@ -8,7 +8,7 @@ sonusai/data/whitenoise.wav,sha256=I2umov0m34y56F9IsIBi1XtE76ZeZaSKDf70cJRe3pI,1
 sonusai/data_generator/__init__.py,sha256=ouCpY5EDV35fKFeKGQfIcU8uE-c3QcuNerTxUA1X5L8,232
 sonusai/data_generator/dataset_from_mixdb.py,sha256=4eQjyZ2TM2FVgbS9Cy8nevfYMBaIyrmHtUiQzJN19Do,5469
 sonusai/data_generator/keras_from_mixdb.py,sha256=V5CUsGz-akIYdgQy9ABxwNKMYKv01klA4xtMDveF6uI,6167
-sonusai/data_generator/torch_from_mixdb.py,sha256=
+sonusai/data_generator/torch_from_mixdb.py,sha256=lvEe9DDu_rIaoyhv9PW4UAnAWp5N74L8kRfxUhsh7oo,4279
 sonusai/evaluate.py,sha256=OH9g3l8yD4X-HHUf-qQriznXQJSW0gtf7XO4P-jbo1U,10025
 sonusai/genft.py,sha256=CnBiQKHQHZMlrq-F1QQJfpw-_45uhyut8cY-O7oDrTk,5557
 sonusai/genmix.py,sha256=l3n-vvSDtwIvYNw9Ulkn5fgAeoyh7reQgGE4Vvth334,7016
@@ -33,7 +33,7 @@ sonusai/metrics/confusion_matrix_summary.py,sha256=3qg6TMKjJeHtNjj2YnNjPFSlMrQXt
 sonusai/metrics/one_hot.py,sha256=lq58zKw0X9sdhJYGEldAkxPFqP3UOYpG_KdxkGHF_3c,13540
 sonusai/metrics/snr_summary.py,sha256=P4U5_Xr7v9F8kF-rZBnpsVNt3p42rIVS6zmch8yfVfg,5575
 sonusai/mixture/__init__.py,sha256=xlGw2FXoMZm2ra97GVfpJ-OTOp10d4dly8AXe8eJwhI,5294
-sonusai/mixture/audio.py,sha256=
+sonusai/mixture/audio.py,sha256=3pat-AIG_FXiGr3aPRa7DSLzolH3PodVDtve-xUuXfk,3242
 sonusai/mixture/augmentation.py,sha256=HwYUJCSmRBWhdnzqKz5zZnMANT83GzJkDrPcWUm6jbg,10884
 sonusai/mixture/class_count.py,sha256=27YDu1puarhp7Rd4EYWGJ-FHP8rAYGd55I6abGqCscY,988
 sonusai/mixture/config.py,sha256=QrasMP-2NGocse2rF_oYkRluDDPo-czFLDEwKtQ8A54,23629
@@ -50,7 +50,7 @@ sonusai/mixture/spectral_mask.py,sha256=qHR2DBpbtz4u1o9sdFMRsUDVUjbof_MRKPW8uY4R
 sonusai/mixture/target_class_balancing.py,sha256=P3gLe2SFos5_N2LWiVFwD-fa_imZH2f1qBiI55BeqXI,4768
 sonusai/mixture/targets.py,sha256=n7PenQuU0pPM_LLXJHmUZ3VeSGDEk7Kdf8y473Xdm6Q,7395
 sonusai/mixture/tokenized_shell_vars.py,sha256=gCxw8SQUcal6mqWKF7hOBTgSQmbJUk1nT0Gn3H8GA0U,4705
-sonusai/mixture/torchaudio_audio.py,sha256=
+sonusai/mixture/torchaudio_audio.py,sha256=qMYXeOSI8U8zaT9x0knPg1dHWzYmswZk7oFGAMG0Jks,2365
 sonusai/mixture/torchaudio_augmentation.py,sha256=LrG19X71UYKMr69WNgJs2R4OTt1QBYu_h8WL5a4ERyE,4462
 sonusai/mixture/truth.py,sha256=Is-nqLXIBM7wjYbS6yzy8mnR8JqxwSabnVHsza0rh_E,1427
 sonusai/mixture/truth_functions/__init__.py,sha256=82lKYHhLy8KW3gHngrocoqwupGVLVsWdIXdYs3vhjOc,359
@@ -114,7 +114,7 @@ sonusai/utils/trim_docstring.py,sha256=dSrtiRsEN4wkkvKBp6WDr13RUypfqZzgH_jOBLs1o
 sonusai/utils/wave.py,sha256=OZe8iVLbKSFv_GdQzLD9hJdBiqimK4FxJ0lVoDbbiqQ,572
 sonusai/utils/yes_or_no.py,sha256=eMLXBVH0cEahiXY4W2KNORmwNQ-ba10eRtldh0y4NYg,263
 sonusai/vars.py,sha256=m2AefF0m5bXWGXpJj8Pi42zWL2ydeEj7bkak3GrtMyM,940
-sonusai-0.14.0.dist-info/METADATA,sha256=
-sonusai-0.14.0.dist-info/WHEEL,sha256=
-sonusai-0.14.0.dist-info/entry_points.txt,sha256=
-sonusai-0.14.0.dist-info/RECORD,,
+sonusai-0.14.2.dist-info/METADATA,sha256=RR8bQ-ZUGFqZZJID86OMAAM6N0h7MYpfwJlDYf4t0v4,2819
+sonusai-0.14.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+sonusai-0.14.2.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
+sonusai-0.14.2.dist-info/RECORD,,
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/WHEEL
File without changes
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/entry_points.txt
File without changes