sonusai 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonusai/calc_metric_spenh.py CHANGED
@@ -13,7 +13,7 @@ options:
      -e WER, --wer-method WER    Word-Error-Rate method: deepgram, google, aixplain_whisper
                                  or whisper (locally run) [default: none]
      -m WMNAME, --whisper-model  Whisper model name used in aixplain_whisper and whisper WER methods.
-                                 [default: small_en]
+                                 [default: tiny]
 
  Calculate speech enhancement metrics of prediction data in PLOC using SonusAI mixture data
  in TLOC as truth/label reference. Metric and extraction data files are written into PLOC.
@@ -83,6 +83,7 @@ matplotlib.use('SVG')
  class MPGlobal:
      mixdb: MixtureDatabase = None
      predict_location: Location = None
+     predwav_mode: bool = None
      truth_est_mode: bool = None
      enable_plot: bool = None
      enable_wav: bool = None
@@ -111,6 +112,304 @@ def power_uncompress(spec):
      return real_uncompress + 1j * imag_uncompress
 
 
+ def snr(clean_speech, processed_speech, sample_rate):
+     # Check the length of the clean and processed speech. Must be the same.
+     clean_length = len(clean_speech)
+     processed_length = len(processed_speech)
+     if clean_length != processed_length:
+         raise ValueError('Both Speech Files must be same length.')
+
+     overall_snr = 10 * np.log10(np.sum(np.square(clean_speech)) / np.sum(np.square(clean_speech - processed_speech)))
+
+     # Global Variables
+     winlength = round(30 * sample_rate / 1000)  # window length in samples
+     skiprate = int(np.floor(winlength / 4))  # window skip in samples
+     MIN_SNR = -10  # minimum SNR in dB
+     MAX_SNR = 35  # maximum SNR in dB
+
+     # For each frame of input speech, calculate the Segmental SNR
+     num_frames = int(clean_length / skiprate - (winlength / skiprate))  # number of frames
+     start = 0  # starting sample
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+     segmental_snr = np.empty(num_frames)
+     EPS = np.spacing(1)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         clean_frame = clean_speech[start:start + winlength]
+         processed_frame = processed_speech[start:start + winlength]
+         clean_frame = np.multiply(clean_frame, window)
+         processed_frame = np.multiply(processed_frame, window)
+
+         # (2) Compute the segmental SNR
+         signal_energy = np.sum(np.square(clean_frame))
+         noise_energy = np.sum(np.square(clean_frame - processed_frame))
+         segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + EPS) + EPS)
+         segmental_snr[frame_count] = max(segmental_snr[frame_count], MIN_SNR)
+         segmental_snr[frame_count] = min(segmental_snr[frame_count], MAX_SNR)
+
+         start = start + skiprate
+
+     return overall_snr, segmental_snr
+
+
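The segmental SNR above is frame-windowed and clipped to [-10, 35] dB. A quick sanity check of the new helper on synthetic data (illustrative only, not part of the package; assumes the snr function above is in scope):

    import numpy as np

    fs = 16000
    rng = np.random.default_rng(0)
    clean = rng.standard_normal(fs)                  # 1 s of unit-variance "speech"
    noisy = clean + 0.1 * rng.standard_normal(fs)    # additive noise at roughly 20 dB SNR

    overall, seg = snr(clean, noisy, fs)
    print(f'overall {overall:.1f} dB, segmental mean {np.mean(seg):.1f} dB')  # both near 20 dB
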
+ def lpcoeff(speech_frame, model_order):
+     # (1) Compute Autocorrelation Lags
+     winlength = np.size(speech_frame)
+     R = np.empty(model_order + 1)
+     E = np.empty(model_order + 1)
+     for k in range(model_order + 1):
+         R[k] = np.dot(speech_frame[0:winlength - k], speech_frame[k:winlength])
+
+     # (2) Levinson-Durbin
+     a = np.ones(model_order)
+     a_past = np.empty(model_order)
+     rcoeff = np.empty(model_order)
+     E[0] = R[0]
+     for i in range(model_order):
+         a_past[0:i] = a[0:i]
+         sum_term = np.dot(a_past[0:i], R[i:0:-1])
+         rcoeff[i] = (R[i + 1] - sum_term) / E[i]
+         a[i] = rcoeff[i]
+         if i == 0:
+             a[0:i] = a_past[0:i] - np.multiply(a_past[i - 1:-1:-1], rcoeff[i])
+         else:
+             a[0:i] = a_past[0:i] - np.multiply(a_past[i - 1::-1], rcoeff[i])
+         E[i + 1] = (1 - rcoeff[i] * rcoeff[i]) * E[i]
+     acorr = R
+     refcoeff = rcoeff
+     lpparams = np.concatenate((np.array([1]), -a))
+     return acorr, refcoeff, lpparams
+
+
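The recursion above is standard Levinson-Durbin, so its output can be cross-checked against a direct solve of the Yule-Walker normal equations. A hedged sketch (scipy assumed available, as it already is for llr and wss):

    import numpy as np
    from scipy.linalg import solve_toeplitz

    frame = np.random.default_rng(1).standard_normal(480)
    order = 10
    acorr, refl, lpparams = lpcoeff(frame, order)                # lpparams = [1, -a_1, ..., -a_p]
    a_direct = solve_toeplitz(acorr[:order], acorr[1:order + 1])  # Toeplitz(R) a = r
    print(np.allclose(-lpparams[1:], a_direct))                   # expected: True
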
+ def llr(clean_speech, processed_speech, sample_rate):
+     from scipy.linalg import toeplitz
+
+     # Check the length of the clean and processed speech. Must be the same.
+     clean_length = np.size(clean_speech)
+     processed_length = np.size(processed_speech)
+     if clean_length != processed_length:
+         raise ValueError('Both Speech Files must be same length.')
+
+     # Global Variables
+     winlength = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+     skiprate = (np.floor(winlength / 4)).astype(int)  # window skip in samples
+     if sample_rate < 10000:
+         P = 10  # LPC Analysis Order
+     else:
+         P = 16  # this could vary depending on sampling frequency.
+
+     # For each frame of input speech, calculate the Log Likelihood Ratio
+     num_frames = int((clean_length - winlength) / skiprate)  # number of frames
+     start = 0  # starting sample
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+     distortion = np.empty(num_frames)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         clean_frame = clean_speech[start:start + winlength]
+         processed_frame = processed_speech[start:start + winlength]
+         clean_frame = np.multiply(clean_frame, window)
+         processed_frame = np.multiply(processed_frame, window)
+
+         # (2) Get the autocorrelation lags and LPC parameters used to compute the LLR measure.
+         R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
+         R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
+
+         # (3) Compute the LLR measure
+         numerator = np.dot(np.matmul(A_processed, toeplitz(R_clean)), A_processed)
+         denominator = np.dot(np.matmul(A_clean, toeplitz(R_clean)), A_clean)
+         distortion[frame_count] = np.log(numerator / denominator)
+         start = start + skiprate
+     return distortion
+
+
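Since the numerator and denominator coincide when the two inputs are identical, the LLR of a signal against itself is exactly zero in every frame. A minimal check:

    import numpy as np

    x = np.random.default_rng(2).standard_normal(16000)
    print(np.allclose(llr(x, x, 16000), 0.0))  # log(num/den) with num == den
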
+ def wss(clean_speech, processed_speech, sample_rate):
+     from scipy.fftpack import fft
+
+     # Check the length of the clean and processed speech, which must be the same.
+     clean_length = np.size(clean_speech)
+     processed_length = np.size(processed_speech)
+     if clean_length != processed_length:
+         raise ValueError('Files must have same length.')
+
+     # Global variables
+     winlength = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+     skiprate = (np.floor(np.divide(winlength, 4))).astype(int)  # window skip in samples
+     max_freq = (np.divide(sample_rate, 2)).astype(int)  # maximum bandwidth
+     num_crit = 25  # number of critical bands
+
+     USE_FFT_SPECTRUM = 1  # defaults to 10th order LP spectrum
+     n_fft = (np.power(2, np.ceil(np.log2(2 * winlength)))).astype(int)
+     n_fftby2 = (np.multiply(0.5, n_fft)).astype(int)  # FFT size/2
+     Kmax = 20.0  # value suggested by Klatt, pg 1280
+     Klocmax = 1.0  # value suggested by Klatt, pg 1280
+
+     # Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
+     cent_freq = np.array([50.0000, 120.000, 190.000, 260.000, 330.000, 400.000, 470.000,
+                           540.000, 617.372, 703.378, 798.717, 904.128, 1020.38, 1148.30,
+                           1288.72, 1442.54, 1610.70, 1794.16, 1993.93, 2211.08, 2446.71,
+                           2701.97, 2978.04, 3276.17, 3597.63])
+     bandwidth = np.array([70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000,
+                           77.3724, 86.0056, 95.3398, 105.411, 116.256, 127.914, 140.423,
+                           153.823, 168.154, 183.457, 199.776, 217.153, 235.631, 255.255,
+                           276.072, 298.126, 321.465, 346.136])
+
+     bw_min = bandwidth[0]  # minimum critical bandwidth
+
+     # Set up the critical band filters.
+     # Note here that Gaussianly shaped filters are used.
+     # Also, the sum of the filter weights is equivalent for each critical band filter.
+     # Filter less than -30 dB and set to zero.
+     min_factor = np.exp(-30.0 / (2.0 * 2.303))  # -30 dB point of filter
+     crit_filter = np.empty((num_crit, n_fftby2))
+     for i in range(num_crit):
+         f0 = (cent_freq[i] / max_freq) * n_fftby2
+         bw = (bandwidth[i] / max_freq) * n_fftby2
+         norm_factor = np.log(bw_min) - np.log(bandwidth[i])
+         j = np.arange(n_fftby2)
+         crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
+         cond = np.greater(crit_filter[i, :], min_factor)
+         crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
+
+     # For each frame of input speech, calculate the Weighted Spectral Slope Measure
+     num_frames = int(clean_length / skiprate - (winlength / skiprate))  # number of frames
+     start = 0  # starting sample
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+     distortion = np.empty(num_frames)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         clean_frame = clean_speech[start:start + winlength] / 32768
+         processed_frame = processed_speech[start:start + winlength] / 32768
+         clean_frame = np.multiply(clean_frame, window)
+         processed_frame = np.multiply(processed_frame, window)
+
+         # (2) Compute the power spectrum of clean and processed
+         # if USE_FFT_SPECTRUM:
+         clean_spec = np.square(np.abs(fft(clean_frame, n_fft)))
+         processed_spec = np.square(np.abs(fft(processed_frame, n_fft)))
+
+         # (3) Compute Filterbank Output Energies (in dB scale)
+         clean_energy = np.matmul(crit_filter, clean_spec[0:n_fftby2])
+         processed_energy = np.matmul(crit_filter, processed_spec[0:n_fftby2])
+
+         clean_energy = 10 * np.log10(np.maximum(clean_energy, 1E-10))
+         processed_energy = 10 * np.log10(np.maximum(processed_energy, 1E-10))
+
+         # (4) Compute Spectral Slope (dB[i+1] - dB[i])
+         clean_slope = clean_energy[1:num_crit] - clean_energy[0:num_crit - 1]
+         processed_slope = processed_energy[1:num_crit] - processed_energy[0:num_crit - 1]
+
+         # (5) Find the nearest peak locations in the spectra to each critical band.
+         # If the slope is negative, we search to the left; if positive, we search to the right.
+         clean_loc_peak = np.empty(num_crit - 1)
+         processed_loc_peak = np.empty(num_crit - 1)
+
+         for i in range(num_crit - 1):
+             # find the peaks in the clean speech signal
+             if clean_slope[i] > 0:  # search to the right
+                 n = i
+                 while (n < num_crit - 1) and (clean_slope[n] > 0):
+                     n = n + 1
+                 clean_loc_peak[i] = clean_energy[n - 1]
+             else:  # search to the left
+                 n = i
+                 while (n >= 0) and (clean_slope[n] <= 0):
+                     n = n - 1
+                 clean_loc_peak[i] = clean_energy[n + 1]
+
+             # find the peaks in the processed speech signal
+             if processed_slope[i] > 0:  # search to the right
+                 n = i
+                 while (n < num_crit - 1) and (processed_slope[n] > 0):
+                     n = n + 1
+                 processed_loc_peak[i] = processed_energy[n - 1]
+             else:  # search to the left
+                 n = i
+                 while (n >= 0) and (processed_slope[n] <= 0):
+                     n = n - 1
+                 processed_loc_peak[i] = processed_energy[n + 1]
+
+         # (6) Compute the WSS Measure for this frame. This includes determination of the weighting function.
+         dBMax_clean = np.max(clean_energy)
+         dBMax_processed = np.max(processed_energy)
+         '''
+         The weights are calculated by averaging individual weighting factors from the clean and processed frame.
+         These weights W_clean and W_processed should range from 0 to 1 and place more emphasis on spectral peaks
+         and less emphasis on slope differences in spectral valleys.
+         This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
+         '''
+         Wmax_clean = np.divide(Kmax, Kmax + dBMax_clean - clean_energy[0:num_crit - 1])
+         Wlocmax_clean = np.divide(Klocmax, Klocmax + clean_loc_peak - clean_energy[0:num_crit - 1])
+         W_clean = np.multiply(Wmax_clean, Wlocmax_clean)
+
+         Wmax_processed = np.divide(Kmax, Kmax + dBMax_processed - processed_energy[0:num_crit - 1])
+         Wlocmax_processed = np.divide(Klocmax, Klocmax + processed_loc_peak - processed_energy[0:num_crit - 1])
+         W_processed = np.multiply(Wmax_processed, Wlocmax_processed)
+
+         W = np.divide(np.add(W_clean, W_processed), 2.0)
+         slope_diff = np.subtract(clean_slope, processed_slope)[0:num_crit - 1]
+         distortion[frame_count] = np.dot(W, np.square(slope_diff)) / np.sum(W)
+         # This normalization is not part of Klatt's paper, but helps to normalize the measure.
+         # Here we scale the measure by the sum of the weights.
+         start = start + skiprate
+     return distortion
+
+
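The same self-consistency check as for llr applies here: identical inputs give identical band energies and slopes, so every frame's weighted slope distortion is zero:

    import numpy as np

    x = np.random.default_rng(3).standard_normal(16000)
    print(np.allclose(wss(x, x, 16000), 0.0))  # slope_diff == 0 in every frame
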
+ def calc_speech_metrics(hypothesis: np.ndarray,
+                         reference: np.ndarray) -> tuple[float, float, float, float, float]:
+     """
+     Calculate speech metrics pesq_mos, CSIG, CBAK, COVL, segSNR. These are all related and thus included
+     in one function. Reference: MATLAB script "compute_metrics.m".
+
+     Usage:
+         pesq, csig, cbak, covl, ssnr = calc_speech_metrics(hypothesis, reference)
+         reference: clean audio as array
+         hypothesis: enhanced audio as array
+         Audio must have sampling rate = 16000 Hz.
+
+     Example call:
+         pesq_output, csig_output, cbak_output, covl_output, ssnr_output = \
+             calc_speech_metrics(predicted_audio, target_audio)
+     """
+     from sonusai.metrics import calc_pesq
+
+     Fs = 16000
+
+     # compute the WSS measure
+     wss_dist_vec = wss(reference, hypothesis, Fs)
+     wss_dist_vec = np.sort(wss_dist_vec)
+     alpha = 0.95  # value from CMGAN reference implementation
+     wss_dist = np.mean(wss_dist_vec[0:round(np.size(wss_dist_vec) * alpha)])
+
+     # compute the LLR measure
+     llr_dist = llr(reference, hypothesis, Fs)
+     ll_rs = np.sort(llr_dist)
+     llr_len = round(np.size(llr_dist) * alpha)
+     llr_mean = np.mean(ll_rs[0:llr_len])
+
+     # compute the SNRseg
+     snr_dist, segsnr_dist = snr(reference, hypothesis, Fs)
+     snr_mean = snr_dist
+     segSNR = np.mean(segsnr_dist)
+
+     # compute the PESQ (use SonusAI wrapper; only fs=16k, mode=wb supported)
+     pesq_mos = calc_pesq(hypothesis=hypothesis, reference=reference)
+     # pesq_mos = pesq(sampling_rate1, data1, data2, 'wb')
+
+     # now compute the composite measures
+     CSIG = 3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist
+     CSIG = max(1, CSIG)
+     CSIG = min(5, CSIG)  # limit values to [1, 5]
+     CBAK = 1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * segSNR
+     CBAK = max(1, CBAK)
+     CBAK = min(5, CBAK)  # limit values to [1, 5]
+     COVL = 1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist
+     COVL = max(1, COVL)
+     COVL = min(5, COVL)  # limit values to [1, 5]
+
+     return pesq_mos, CSIG, CBAK, COVL, segSNR
+
+
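A hedged usage sketch (requires a SonusAI install so that calc_pesq resolves; enhanced and clean are placeholder 16 kHz float arrays of equal length, not provided here):

    enhanced, clean = ...  # placeholder arrays
    pesq_mos, csig, cbak, covl, segsnr = calc_speech_metrics(enhanced, clean)
    print(f'PESQ {pesq_mos:.2f}  CSIG {csig:.2f}  CBAK {cbak:.2f}  COVL {covl:.2f}  SSNR {segsnr:.1f} dB')

The CSIG/CBAK/COVL lines are the Hu-Loizou composite measures: fixed linear combinations of PESQ, LLR, WSS, and segmental SNR, clamped to the 1-5 MOS range.
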
  def mean_square_error(hypothesis: np.ndarray,
                        reference: np.ndarray,
                        squared: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
@@ -191,31 +490,43 @@ def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray
      return err, err_b, err_f
 
 
- def phase_distance(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-     """Calculate weighted phase distance error
+ def phase_distance(reference: np.ndarray,
+                    hypothesis: np.ndarray,
+                    eps: float = 1e-9) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+     """Calculate weighted phase distance error (weight normalization over bins per frame)
 
      :param reference: complex [frames, bins]
      :param hypothesis: complex [frames, bins]
+     :param eps: small constant to avoid division by zero
      :return: mean, mean per bin, mean per frame
      """
-     reference_mag = np.abs(reference)
-     # rh_angle_diff = np.angle(reference) - np.angle(hypothesis)
-     rh_angle_diff = np.angle(reference / (hypothesis + 1e-7)) * 180 / np.pi  # angle diff +/-180
+     ang_diff = np.angle(reference) - np.angle(hypothesis)
+     phd_mod = (ang_diff + np.pi) % (2 * np.pi) - np.pi
+     rh_angle_diff = phd_mod * 180 / np.pi  # angle diff in deg
+
+     # Use complex divide to intrinsically keep angle diff +/-180 deg, but avoid div by zero (real hyp)
+     # hyp_real = np.real(hypothesis)
+     # near_zeros = np.real(hyp_real) < eps
+     # hyp_real = hyp_real * (np.logical_not(near_zeros))
+     # hyp_real = hyp_real + (near_zeros * eps)
+     # hypothesis = hyp_real + 1j * np.imag(hypothesis)
+     # rh_angle_diff = np.angle(reference / hypothesis) * 180 / np.pi  # angle diff +/-180
 
      # weighted mean over all (scalar)
-     ref_weight = reference_mag / (np.sum(reference_mag) + 1e-7)  # frames x bins
+     reference_mag = np.abs(reference)
+     ref_weight = reference_mag / (np.sum(reference_mag) + eps)  # frames x bins
      err = np.around(np.sum(ref_weight * rh_angle_diff), 3)
 
      # weighted mean over frames (value per bin)
      err_b = np.zeros(reference.shape[1])
      for bi in range(reference.shape[1]):
-         ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) + 1e-7)
+         ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) + eps)
          err_b[bi] = np.around(np.sum(ref_weight * rh_angle_diff[:, bi]), 3)
 
      # weighted mean over bins (value per frame)
      err_f = np.zeros(reference.shape[0])
      for fi in range(reference.shape[0]):
-         ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) + 1e-7)
+         ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) + eps)
          err_f[fi] = np.around(np.sum(ref_weight * rh_angle_diff[fi, :]), 3)
 
      return err, err_b, err_f
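The modulo wrap above keeps the error within ±180 degrees without the divide-by-zero risk of the complex-division approach that is now commented out. A minimal check with [frames, bins]-shaped inputs:

    import numpy as np

    ref = np.exp(1j * np.deg2rad(179.0)) * np.ones((1, 1))   # phase +179 deg
    hyp = np.exp(1j * np.deg2rad(-179.0)) * np.ones((1, 1))  # phase -179 deg
    err, err_b, err_f = phase_distance(ref, hyp)
    print(err)  # ~ -2.0 degrees after wrapping, not 358
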
@@ -228,6 +539,7 @@ def plot_mixpred(mixture: AudioT,
                   predict: Optional[Predict] = None,
                   tp_title: str = '') -> plt.Figure:
      from sonusai.mixture import SAMPLE_RATE
+
      num_plots = 2
      if feature is not None:
          num_plots += 1
@@ -268,8 +580,8 @@ def plot_mixpred(mixture: AudioT,
 
 
  def plot_pdb_predtruth(predict: np.ndarray,
-                        truth_f: np.ndarray | None = None,
-                        metric: np.ndarray | None = None,
+                        truth_f: Optional[np.ndarray] = None,
+                        metric: Optional[np.ndarray] = None,
                         tp_title: str = '') -> plt.Figure:
      """Plot predict and optionally truth and a metric in power db, e.g. applies 10*log10(predict)"""
      num_plots = 2
@@ -320,9 +632,9 @@ def plot_pdb_predtruth(predict: np.ndarray,
 
  def plot_epredtruth(predict: np.ndarray,
                      predict_wav: np.ndarray,
-                     truth_f: np.ndarray | None = None,
-                     truth_wav: np.ndarray | None = None,
-                     metric: np.ndarray | None = None,
+                     truth_f: Optional[np.ndarray] = None,
+                     truth_wav: Optional[np.ndarray] = None,
+                     metric: Optional[np.ndarray] = None,
                      tp_title: str = '') -> plt.Figure:
      """Plot predict spectrogram and waveform and optionally truth and a metric"""
      num_plots = 2
@@ -390,76 +702,91 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
      from os.path import splitext
 
      import h5py
-     from matplotlib.backends.backend_pdf import PdfPages
+     from numpy import inf
+     from pystoi import stoi
 
      from sonusai import SonusAIError
+     from sonusai import logger
      from sonusai.metrics import calc_pcm
-     from sonusai.metrics import calc_pesq
-     from sonusai.metrics import calc_sa_sdr
      from sonusai.metrics import calc_wer
      from sonusai.metrics import calc_wsdr
+     from sonusai.mixture import forward_transform
      from sonusai.mixture import inverse_transform
+     from sonusai.mixture import read_audio
      from sonusai.utils import calc_asr
      from sonusai.utils import float_to_int16
      from sonusai.utils import reshape_outputs
+     from sonusai.utils import stack_complex
      from sonusai.utils import unstack_complex
      from sonusai.utils import write_wav
 
+     mixdb = MP_GLOBAL.mixdb
+     predict_location = MP_GLOBAL.predict_location
+     predwav_mode = MP_GLOBAL.predwav_mode
+     truth_est_mode = MP_GLOBAL.truth_est_mode
+     enable_plot = MP_GLOBAL.enable_plot
+     enable_wav = MP_GLOBAL.enable_wav
+     wer_method = MP_GLOBAL.wer_method
+     whisper_model = MP_GLOBAL.whisper_model
+
      # 1) Read predict data, var predict with shape [BatchSize,Classes] or [BatchSize,Tsteps,Classes]
-     output_name = join(MP_GLOBAL.predict_location, MP_GLOBAL.mixdb.mixtures[mixid].name)
+     output_name = join(predict_location, mixdb.mixture(mixid).name)
      predict = None
-     if not MP_GLOBAL.truth_est_mode:
-         base_name = splitext(output_name)[0]
-         try:
-             with h5py.File(output_name, 'r') as f:
-                 predict = np.array(f['predict'])
-         except Exception as e:
-             raise SonusAIError(f'Error reading {output_name}: {e}')
-         # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
-         if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
-             # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
-             predict, _ = reshape_outputs(predict=predict, timesteps=predict.shape[1])
-     else:
-         # in truth estimation mode we use the truth instead of prediction to see metrics with perfect input
-         # so don't bother to read prediction and mark outputs with tru suffix, i.e. 0000_truest_*
+     if truth_est_mode:
+         # in truth estimation mode we use the truth in place of prediction to see metrics with perfect input
+         # don't bother to read prediction, and predict var will get assigned to truth later
+         # mark outputs with tru suffix, i.e. 0000_truest_*
          base_name = splitext(output_name)[0] + '_truest'
+     else:
+         base_name, ext = splitext(output_name)  # base_name used later
+         if not predwav_mode:
+             try:
+                 with h5py.File(output_name, 'r') as f:
+                     predict = np.array(f['predict'])
+             except Exception as e:
+                 raise SonusAIError(f'Error reading {output_name}: {e}')
+             # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
+             if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
+                 # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
+                 predict, _ = reshape_outputs(predict=predict, truth=None, timesteps=predict.shape[1])
+         else:
+             base_name, ext = splitext(output_name)
+             prfname = join(base_name + '.wav')
+             audio = read_audio(prfname)
+             predict = forward_transform(audio, mixdb.ft_config)
+             if mixdb.feature[0:1] == 'h':
+                 predict = power_compress(predict)
+             predict = stack_complex(predict)
 
      # 2) Collect true target, noise, mixture data, trim to predict size if needed
-     target = MP_GLOBAL.mixdb.mixture_target(mixid)
-     target_f = MP_GLOBAL.mixdb.mixture_target_f(mixid, target=target)
-     noise = MP_GLOBAL.mixdb.mixture_noise(mixid)
-     noise_f = MP_GLOBAL.mixdb.mixture_noise_f(mixid, noise=noise)
-     mixture = MP_GLOBAL.mixdb.mixture_mixture(mixid, target=target, noise=noise)
-     mixture_f = MP_GLOBAL.mixdb.mixture_mixture_f(mixid, mixture=mixture)
-     segsnr_f = MP_GLOBAL.mixdb.mixture_segsnr(mixid, target=target, noise=noise)
-     segsnr_f[segsnr_f == np.inf] = 7.944e8  # 99 dB
-     segsnr_f[segsnr_f == -np.inf] = 1.258e-10  # -99 dB
+     target = mixdb.mixture_target(mixid)
+     target_f = mixdb.mixture_target_f(mixid, target=target)
+     noise = mixdb.mixture_noise(mixid)
+     noise_f = mixdb.mixture_noise_f(mixid, noise=noise)
+     mixture = mixdb.mixture_mixture(mixid, target=target, noise=noise)
+     mixture_f = mixdb.mixture_mixture_f(mixid, mixture=mixture)
+     segsnr_f = mixdb.mixture_segsnr(mixid, target=target, noise=noise)
+     segsnr_f[segsnr_f == inf] = 7.944e8  # 99 dB
+     segsnr_f[segsnr_f == -inf] = 1.258e-10  # -99 dB
      # need to use inv-tf to match #samples & latency shift properties of predict inv tf
-     targetfi = inverse_transform(target_f, MP_GLOBAL.mixdb.it_config)
-     noisefi = inverse_transform(noise_f, MP_GLOBAL.mixdb.it_config)
-     # mixturefi = inverse_transform(mixture_f, MP_GLOBAL.mixdb.it_config)
+     targetfi = inverse_transform(target_f, mixdb.it_config)
+     noisefi = inverse_transform(noise_f, mixdb.it_config)
+     # mixturefi = mixdb.inverse_transform(mixture_f)
 
      # gen feature, truth - note feature only used for plots
      # TBD parse truth_f for different formats and also multi-truth
-     feature, truth_f = MP_GLOBAL.mixdb.mixture_ft(mixid, mixture=mixture)
-     truth_type = MP_GLOBAL.mixdb.targets[MP_GLOBAL.mixdb.mixtures[mixid].target_id[0]].truth_settings[0].function
+     feature, truth_f = mixdb.mixture_ft(mixid, mixture=mixture)
+     truth_type = mixdb.target(mixdb.mixture(mixid).target_id[0]).truth_settings[0].function
      if truth_type == 'target_mixture_f':
          half = truth_f.shape[-1] // 2
          truth_f = truth_f[..., :half]  # extract target_f only
 
-     if target_f.shape[0] != truth_f.shape[0]:
-         raise SonusAIError(f'Error: mixture {mixid} does not have the same number of frames as truth, '
-                            f'{target_f.shape[0]} != {truth_f.shape[0]}')
-
-     if not MP_GLOBAL.truth_est_mode:
+     if not truth_est_mode:
          if predict.shape[0] < target_f.shape[0]:  # target_f, truth_f, mixture_f, etc. same size
              trimf = target_f.shape[0] - predict.shape[0]
-             logger.debug(f'Warning: prediction {mixid} has fewer frames than true mixture, '
-                          f'{predict.shape[0]} < {target_f.shape[0]}'
-                          f'trimming {trimf} frames from all truth.')
+             logger.debug(f'Warning: prediction frames less than mixture, trimming {trimf} frames from all truth.')
              target_f = target_f[0:-trimf, :]
-             targetfi = inverse_transform(target_f, MP_GLOBAL.mixdb.it_config)
+             targetfi, _ = inverse_transform(target_f, mixdb.it_config)
              trimt = target.shape[0] - targetfi.shape[0]
             target = target[0:-trimt]
             noise_f = noise_f[0:-trimf, :]
@@ -468,30 +795,29 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
              mixture = mixture[0:-trimt]
              truth_f = truth_f[0:-trimf, :]
          elif predict.shape[0] > target_f.shape[0]:
-             raise SonusAIError(f'Error: prediction {mixid} has more frames than true mixture, '
-                                f'{predict.shape[0]} > {target_f.shape[0]}')
+             raise SonusAIError(
+                 f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')
 
      # 3) Extraction - format proper complex and wav estimates and truth (unstack, uncompress, inv tf, etc.)
-     if MP_GLOBAL.truth_est_mode:
+     if truth_est_mode:
          predict = truth_f  # substitute truth for the prediction (for test/debug)
          predict_complex = unstack_complex(predict)  # unstack
          # if feat has compressed mag and truth does not, compress it
-         if MP_GLOBAL.mixdb.feature[0:2] == 'hn' and MP_GLOBAL.mixdb.targets[0].truth_settings[0].function[0:10] != 'targetcmpr':
+         if mixdb.feature[0:1] == 'h' and mixdb.target(1).truth_settings[0].function[0:10] != 'targetcmpr':
              predict_complex = power_compress(predict_complex)  # from uncompressed truth
      else:
          predict_complex = unstack_complex(predict)
 
      truth_f_complex = unstack_complex(truth_f)
-     if MP_GLOBAL.mixdb.feature[0:2] == 'hn':  # if feat has compressed mag
+     if mixdb.feature[0:1] == 'h':  # 'hn', 'ha', 'hd', etc.: feat has compressed mag
          # estimate noise in uncompressed-mag domain
          noise_est_complex = mixture_f - power_uncompress(predict_complex)
          predict_complex = power_uncompress(predict_complex)  # uncompress if truth is compressed
      else:  # cn, c8, ..
          noise_est_complex = mixture_f - predict_complex
 
-     target_est_wav = inverse_transform(predict_complex, MP_GLOBAL.mixdb.it_config)
-     noise_est_wav = inverse_transform(noise_est_complex, MP_GLOBAL.mixdb.it_config)
+     target_est_wav = inverse_transform(predict_complex, mixdb.it_config)
+     noise_est_wav = inverse_transform(noise_est_complex, mixdb.it_config)
 
      # 4) Metrics
      # Target/Speech logerr - PSD estimation accuracy symmetric mean log-spectral distortion
@@ -509,19 +835,25 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
      # Noise td logerr
      # lerr_nt, lerr_nt_bin, lerr_nt_frame = log_error(noisefi, noise_truth_est_audio)
 
-     # SA-SDR (time-domain source-aggregated SDR)
+     # # SA-SDR (time-domain source-aggregated SDR)
      ytrue = np.concatenate((targetfi[:, np.newaxis], noisefi[:, np.newaxis]), axis=1)
      ypred = np.concatenate((target_est_wav[:, np.newaxis], noise_est_wav[:, np.newaxis]), axis=1)
-     # note: w/o scale is more pessimistic number
-     sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
+     # # note: w/o scale is more pessimistic number
+     # sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
+     target_stoi = stoi(targetfi, target_est_wav, 16000, extended=False)
 
      wsdr, wsdr_cc, wsdr_cw = calc_wsdr(hypothesis=ypred, reference=ytrue, with_log=True)
-     logger.debug(f'mixid {mixid} wsdr: cw {wsdr_cw}, sum(cw) {np.sum(wsdr_cw)}, cc {wsdr_cc}')
+     # logger.debug(f'wsdr weight sum for mixid {mixid} = {np.sum(wsdr_cw)}.')
+     # logger.debug(f'wsdr cweights = {wsdr_cw}.')
+     # logger.debug(f'wsdr ccoefs for mixid {mixid} = {wsdr_cc}.')
 
      # Speech intelligibility measure - PESQ
-     if int(MP_GLOBAL.mixdb.mixtures[mixid].snr) > -99:
-         pesq_speech = calc_pesq(hypothesis=target_est_wav, reference=target)
-         pesq_mixture = calc_pesq(hypothesis=mixture, reference=target)
+     if int(mixdb.mixture(mixid).snr) > -99:
+         # len = target_est_wav.shape[0]
+         pesq_speech, csig_tg, cbak_tg, covl_tg, sgsnr_tg = calc_speech_metrics(target_est_wav, targetfi)
+         pesq_mixture, csig_mx, cbak_mx, covl_mx, sgsnr_mx = calc_speech_metrics(mixture, target)
+         # pesq_speech_tst = calc_pesq(hypothesis=target_est_wav, reference=target)
+         # pesq_mixture_tst = calc_pesq(hypothesis=mixture, reference=target)
          # pesq improvement
          pesq_impr = pesq_speech - pesq_mixture
          # pesq improvement %
@@ -530,12 +862,18 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          pesq_speech = 0
          pesq_mixture = 0
          pesq_impr_pc = np.float32(0)
+         csig_mx = 0
+         csig_tg = 0
+         cbak_mx = 0
+         cbak_tg = 0
+         covl_mx = 0
+         covl_tg = 0
 
      # Calc WER
-     if MP_GLOBAL.wer_method == 'none':
-         asr_tt = ''
-         asr_mx = ''
-         asr_tge = ''
+     asr_tt = ''
+     asr_mx = ''
+     asr_tge = ''
+     if wer_method == 'none' or mixdb.mixture(mixid).snr == -99:  # noise only, ignore/reset target asr
          wer_mx = float('nan')
          wer_tge = float('nan')
          wer_pi = float('nan')
@@ -543,13 +881,11 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          if MP_GLOBAL.mixdb.asr_manifests:
              asr_tt = MP_GLOBAL.mixdb.mixture_asr_data(mixid)[0]  # ignore mixup
          else:
-             asr_tt = calc_asr(target, engine=MP_GLOBAL.wer_method,
-                               whisper_model_name=MP_GLOBAL.whisper_model).text  # target truth
+             asr_tt = calc_asr(target, engine=wer_method, whisper_model_name=whisper_model).text  # target truth
 
          if asr_tt:
-             asr_mx = calc_asr(mixture, engine=MP_GLOBAL.wer_method, whisper_model_name=MP_GLOBAL.whisper_model).text
-             asr_tge = calc_asr(target_est_wav, engine=MP_GLOBAL.wer_method,
-                                whisper_model_name=MP_GLOBAL.whisper_model).text
+             asr_mx = calc_asr(mixture, engine=wer_method, whisper_model=whisper_model).text
+             asr_tge = calc_asr(target_est_wav, engine=wer_method, whisper_model=whisper_model).text
 
              wer_mx = calc_wer(asr_mx, asr_tt).wer * 100  # mixture wer
              wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate wer
@@ -561,24 +897,21 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
              else:
                  wer_pi = 100 * (wer_mx - wer_tge) / wer_mx
          else:
-             if MP_GLOBAL.mixdb.mixtures[mixid].snr != -99:
-                 print(f'Warning: mixid {mixid} asr truth is empty, setting to 0% wer')
-             asr_mx = ''
-             asr_tge = ''
+             print(f'Warning: mixid {mixid} asr truth is empty, setting to 0% wer')
              wer_mx = float(0)
              wer_tge = float(0)
              wer_pi = float(0)
 
      # 5) Save per mixture metric results
      # Single row in table of scalar metrics per mixture
-     mtable1_col = ['MXSNR', 'MXPESQ', 'PESQ', 'PESQi%', 'MXWER', 'WER', 'WERi%', 'WSDR', 'SASDR',
-                    'PCM', 'SPLERR', 'NLERR', 'PD', 'SPFILE', 'NFILE']
-     ti = MP_GLOBAL.mixdb.mixtures[mixid].target_id[0]
-     ni = MP_GLOBAL.mixdb.mixtures[mixid].noise_id
-     metr1 = [MP_GLOBAL.mixdb.mixtures[mixid].snr, pesq_mixture, pesq_speech, pesq_impr_pc, wer_mx, wer_tge, wer_pi,
-              wsdr,
-              sa_sdr, pcm, lerr_tg, lerr_n, phd, basename(MP_GLOBAL.mixdb.targets[ti].name),
-              basename(MP_GLOBAL.mixdb.noises[ni].name)]
+     mtable1_col = ['MXSNR', 'MXPESQ', 'PESQ', 'PESQi%', 'MXWER', 'WER', 'WERi%', 'WSDR', 'STOI',
+                    'PCM', 'SPLERR', 'NLERR', 'PD', 'MXCSIG', 'CSIG', 'MXCBAK', 'CBAK', 'MXCOVL', 'COVL',
+                    'SPFILE', 'NFILE']
+     ti = mixdb.mixture(mixid).target_id[0]
+     ni = mixdb.mixture(mixid).noise_id
+     metr1 = [mixdb.mixture(mixid).snr, pesq_mixture, pesq_speech, pesq_impr_pc, wer_mx, wer_tge, wer_pi, wsdr,
+              target_stoi, pcm, lerr_tg, lerr_n, phd, csig_mx, csig_tg, cbak_mx, cbak_tg, covl_mx, covl_tg,
+              basename(mixdb.target(ti).name), basename(mixdb.noise(ni).name)]
      mtab1 = pd.DataFrame([metr1], columns=mtable1_col, index=[mixid])
 
      # Stats of per frame estimation metrics
@@ -588,7 +921,8 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
                          'NLERR': lerr_n_frame,
                          'SPD': phd_frame})
      metr2 = metr2.describe()  # Use pandas stat function
-     metr2['SSNR'][1:] = metr2['SSNR'][1:].apply(lambda x: 10 * np.log10(x))  # Change SSNR stats to dB, except count
+     metr2['SSNR'][1:] = metr2['SSNR'][1:].apply(
+         lambda x: 10 * np.log10(x + 1.01e-10))  # Change SSNR stats to dB, except count
      # create a single row in multi-column header
      new_labels = pd.MultiIndex.from_product([metr2.columns,
                                               ['Avg', 'Min', 'Med', 'Max', 'Std']],
@@ -597,7 +931,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
      mtab2 = pd.DataFrame(dat1row,
                           index=[mixid],
                           columns=new_labels)
-     mtab2.insert(0, 'MXSNR', MP_GLOBAL.mixdb.mixtures[mixid].snr, False)  # add MXSNR as the first metric column
+     mtab2.insert(0, 'MXSNR', mixdb.mixture(mixid).snr, False)  # add MXSNR as the first metric column
 
      all_metrics_table_1 = mtab1  # return to be collected by process
      all_metrics_table_2 = mtab2  # return to be collected by process
@@ -610,41 +944,44 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
          print(f'Extraction statistics over {mixture_f.shape[0]} frames:', file=f)
          print(metr2.round(2).to_string(float_format=lambda x: "{:.2f}".format(x)), file=f)
          print('', file=f)
-         print(f'Target path: {MP_GLOBAL.mixdb.targets[ti].name}', file=f)
-         print(f'Noise path: {MP_GLOBAL.mixdb.noises[ni].name}', file=f)
-         if MP_GLOBAL.wer_method != 'none':
-             print(f'WER method: {MP_GLOBAL.wer_method} and whisper model (if used): {MP_GLOBAL.whisper_model}', file=f)
-             if MP_GLOBAL.mixdb.asr_manifests:
+         print(f'Target path: {mixdb.target(ti).name}', file=f)
+         print(f'Noise path: {mixdb.noise(ni).name}', file=f)
+         if wer_method != 'none':
+             print(f'WER method: {wer_method} and whisper model (if used): {whisper_model}', file=f)
+             if mixdb.asr_manifests:
                  print(f'ASR truth from manifest: {asr_tt}', file=f)
              else:
                  print(f'ASR truth from wer method: {asr_tt}', file=f)
              print(f'ASR result for mixture: {asr_mx}', file=f)
             print(f'ASR result for prediction: {asr_tge}', file=f)
-         # print(f'PESQ improvement: {pesq_impr:0.2f}, {pesq_impr_pc:0.1f}%', file=f)
+
+         print(f'Augmentations: {mixdb.mixture(mixid)}', file=f)
 
      # 7) write wav files
-     if MP_GLOBAL.enable_wav:
+     if enable_wav:
          write_wav(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
          write_wav(name=base_name + '_target.wav', audio=float_to_int16(target))
+         # write_wav(name=base_name + '_targetfi.wav', audio=float_to_int16(targetfi))
          write_wav(name=base_name + '_noise.wav', audio=float_to_int16(noise))
          write_wav(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
          write_wav(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))
 
          # debug code to test for perfect reconstruction of the extraction method
          # note both 75% olsa-hanns and 50% olsa-hann modes checked to have perfect reconstruction
-         # target_r = inverse_transform(target_f, MP_GLOBAL.mixdb.it_config)
-         # noise_r = inverse_transform(noise_f, MP_GLOBAL.mixdb.it_config)
+         # target_r = mixdb.inverse_transform(target_f)
+         # noise_r = mixdb.inverse_transform(noise_f)
          # _write_wav(name=base_name + '_target_r.wav', audio=float_to_int16(target_r))
          # _write_wav(name=base_name + '_noise_r.wav', audio=float_to_int16(noise_r))  # chk perfect rec
 
      # 8) Write out plot file
-     if MP_GLOBAL.enable_plot:
+     if enable_plot:
+         from matplotlib.backends.backend_pdf import PdfPages
          plot_fname = base_name + '_metric_spenh.pdf'
 
          # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
         # Original size (frames, stride, num_bands), decimates in stride dimension only if step is > 1
         # Reshape to get frames*decimated_stride, num_bands
-         step = int(MP_GLOBAL.mixdb.feature_samples / MP_GLOBAL.mixdb.feature_step_samples)
+         step = int(mixdb.feature_samples / mixdb.feature_step_samples)
          if feature.ndim != 3:
              raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, num_bands')
@@ -656,18 +993,17 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
 
          with PdfPages(plot_fname) as pdf:
              # page1 we always have a mixture and prediction, target optional if truth provided
-             tfunc_name = MP_GLOBAL.mixdb.targets[0].truth_settings[0].function  # first target, assumes all have same
-             match tfunc_name:
-                 case 'mapped_snr_f':
-                     # leave as unmapped snr
-                     predplot = predict
-                     tfunc_name = MP_GLOBAL.mixdb.targets[0].truth_settings[0].function
-                 case 'target_f' | 'target_mixture_f':
-                     predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
-                 case _:
-                     # use dB scale
-                     predplot = 10 * np.log10(predict + np.finfo(np.float32).eps)
-                     tfunc_name = tfunc_name + ' (db)'
+             tfunc_name = mixdb.target(1).truth_settings[0].function  # first target, assumes all have same
+             if tfunc_name == 'mapped_snr_f':
+                 # leave as unmapped snr
+                 predplot = predict
+                 tfunc_name = mixdb.target(1).truth_settings[0].function
+             elif tfunc_name == 'target_f' or tfunc_name == 'target_mixture_f':
+                 predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
+             else:
+                 # use dB scale
+                 predplot = 10 * np.log10(predict + np.finfo(np.float32).eps)
+                 tfunc_name = tfunc_name + ' (db)'
 
              mixspec = 20 * np.log10(abs(mixture_f) + np.finfo(np.float32).eps)
              pdf.savefig(plot_mixpred(mixture=mixture,
@@ -710,7 +1046,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
      return all_metrics_table_1, all_metrics_table_2
 
 
- def main() -> None:
+ def main():
      from docopt import docopt
 
      import sonusai
@@ -729,17 +1065,20 @@ def main() -> None:
      truth_location = args['TLOC']
      whisper_model = args['--whisper-model'].lower()
 
-     from glob import glob
+     import glob
      from os.path import basename
      from os.path import isdir
      from os.path import join
+     from os.path import split
 
      from tqdm import tqdm
 
      from sonusai import create_file_handler
      from sonusai import initial_log_messages
+     from sonusai import logger
      from sonusai import update_console_handler
-     from sonusai.mixture import DEFAULT_SPEECH
+     from sonusai.mixture import DEFAULT_NOISE
+     from sonusai.mixture import MixtureDatabase
      from sonusai.mixture import read_audio
      from sonusai.utils import calc_asr
      from sonusai.utils import pp_tqdm_imap
@@ -749,12 +1088,19 @@ def main() -> None:
          print(f'The specified predict location {predict_location} is not a valid subdirectory path, exiting ...')
 
      # allpfiles = listdir(predict_location)
-     allph5files = glob(predict_location + "/*.h5")
-     predict_logfile = glob(predict_location + "/*predict.log")
-     if len(allph5files) <= 0 and not truth_est_mode:
-         print(f'Subdirectory {predict_location} has no files with .h5 extension, exiting ...')
+     allpfiles = glob.glob(predict_location + "/*.h5")
+     predict_logfile = glob.glob(predict_location + "/*predict.log")
+     predwav_mode = False
+     if len(allpfiles) <= 0 and not truth_est_mode:
+         allpfiles = glob.glob(predict_location + "/*.wav")  # check for wav files
+         if len(allpfiles) <= 0:
+             print(f'Subdirectory {predict_location} has no .h5 or .wav files, exiting ...')
+         else:
+             logger.info(f'Found {len(allpfiles)} prediction .wav files.')
+             predwav_mode = True
      else:
-         logger.info(f'Found {len(allph5files)} prediction .h5 files.')
+         logger.info(f'Found {len(allpfiles)} prediction .h5 files.')
+
      if len(predict_logfile) == 0:
          logger.info(f'Warning, predict location {predict_location} has no prediction log files.')
      else:
@@ -767,52 +1113,61 @@ def main() -> None:
 
      mixdb = MixtureDatabase(truth_location)
      mixids = mixdb.mixids_to_list(mixids)
-     logger.info(f'Found mixdb of {mixdb.num_mixtures} total mixtures, '
-                 f'with {mixdb.num_classes} classes in {truth_location}')
+     logger.info(
+         f'Found mixdb of {mixdb.num_mixtures} total mixtures, with {mixdb.num_classes} classes in {truth_location}')
      logger.info(f'Only running specified subset of {len(mixids)} mixtures')
 
      enable_asr_warmup = False
-     match wer_method:
-         case 'none':
-             fnb = 'metric_spenh_'
-         case 'google':
-             fnb = 'metric_spenh_ggl_'
-             logger.info(f'WER enabled with method {wer_method}')
-             enable_asr_warmup = True
-         case 'deepgram':
-             fnb = 'metric_spenh_dgram_'
-             logger.info(f'WER enabled with method {wer_method}')
-             enable_asr_warmup = True
-         case 'aixplain_whisper':
-             fnb = 'metric_spenh_whsp_' + whisper_model + '_'
-             logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
-             enable_asr_warmup = True
-         case 'whisper':
-             fnb = 'metric_spenh_whspl_' + whisper_model + '_'
-             logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
-             enable_asr_warmup = True
-         case _:
-             logger.error(f'Unrecognized WER method: {wer_method}')
-             return
+     if wer_method == 'none':
+         fnb = 'metric_spenh_'
+     elif wer_method == 'google':
+         fnb = 'metric_spenh_ggl_'
+         logger.info(f'WER enabled with method {wer_method}')
+         enable_asr_warmup = True
+     elif wer_method == 'deepgram':
+         fnb = 'metric_spenh_dgram_'
+         logger.info(f'WER enabled with method {wer_method}')
+         enable_asr_warmup = True
+     elif wer_method == 'aixplain_whisper':
+         fnb = 'metric_spenh_whspx_' + whisper_model + '_'
+         logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+         enable_asr_warmup = True
+     elif wer_method == 'whisper':
+         fnb = 'metric_spenh_whspl_' + whisper_model + '_'
+         logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+         enable_asr_warmup = True
+     elif wer_method == 'aaware_whisper':
+         fnb = 'metric_spenh_whspaaw_' + whisper_model + '_'
+         logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+         enable_asr_warmup = True
+     elif wer_method == 'fastwhisper':
+         fnb = 'metric_spenh_fwhsp_' + whisper_model + '_'
+         logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+         enable_asr_warmup = True
+     else:
+         logger.error(f'Unrecognized WER method: {wer_method}')
+         return
 
      if enable_asr_warmup:
+         DEFAULT_SPEECH = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
          audio = read_audio(DEFAULT_SPEECH)
-         logger.info(f'Warming up ASR method, note for cloud service this could take up to a few minutes ...')
+         logger.info(f'Warming up ASR method; note for a cloud service this could take up to a few minutes ...')
          asr_chk = calc_asr(audio, engine=wer_method, whisper_model_name=whisper_model)
          logger.info(f'Warmup completed, results {asr_chk}')
 
      MP_GLOBAL.mixdb = mixdb
      MP_GLOBAL.predict_location = predict_location
+     MP_GLOBAL.predwav_mode = predwav_mode
      MP_GLOBAL.truth_est_mode = truth_est_mode
      MP_GLOBAL.enable_plot = enable_plot
      MP_GLOBAL.enable_wav = enable_wav
      MP_GLOBAL.wer_method = wer_method
      MP_GLOBAL.whisper_model = whisper_model
 
-     progress = tqdm(total=len(mixids))
-     all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress)
+     # Individual mixtures use pandas print, set precision to 2 decimal places
+     # pd.set_option('float_format', '{:.2f}'.format)
+     progress = tqdm(total=len(mixids), desc='calc_metric_spenh')
+     all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, num_cpus=None)
      progress.close()
 
      all_metrics_table_1 = pd.concat([item[0] for item in all_metrics_tables])
@@ -842,29 +1197,32 @@ def main() -> None:
      mtab_snr_summary['PESQi%'] = 100 * (mtab_snr_summary['PESQ'] - mtab_snr_summary['MXPESQ']) / np.maximum(
          mtab_snr_summary['MXPESQ'], 0.01)
      for i in range(len(mtab_snr_summary)):
-         tmp_mxwer = mtab_snr_summary['MXWER'].iloc[i]
-         tmp_wer = mtab_snr_summary['WER'].iloc[i]
-         if tmp_mxwer == 0.0:
-             if tmp_wer == 0.0:
+         if mtab_snr_summary['MXWER'].iloc[i] == 0.0:
+             if mtab_snr_summary['WER'].iloc[i] == 0.0:
                  mtab_snr_summary['WERi%'].iloc[i] = 0.0
              else:
                  mtab_snr_summary['WERi%'].iloc[i] = -999.0
          else:
-             mtab_snr_summary['WERi%'].iloc[i] = 100 * (tmp_mxwer - tmp_wer) / tmp_mxwer
+             mtab_snr_summary['WERi%'].iloc[i] = 100 * (mtab_snr_summary['MXWER'].iloc[i] -
+                                                        mtab_snr_summary['WER'].iloc[i]) / \
+                                                 mtab_snr_summary['MXWER'].iloc[i]
 
      # Calculate avg metrics over all mixtures except -99
      all_mtab1_sorted_nom99 = all_mtab1_sorted[all_mtab1_sorted.MXSNR != -99]
      all_nom99_mean = all_mtab1_sorted_nom99.mean(numeric_only=True)
 
      # correct the percentage averages with a direct calculation (PESQ% and WER%):
-     all_nom99_mean[3] = 100 * (all_nom99_mean[2] - all_nom99_mean[1]) / np.maximum(all_nom99_mean[1], 0.01)  # pesq%
-     if all_nom99_mean[4] == 0.0:
-         if all_nom99_mean[5] == 0.0:
-             all_nom99_mean[6] = 0.0
+     # ser.iloc[pos]
+     all_nom99_mean['PESQi%'] = (100 * (all_nom99_mean['PESQ'] - all_nom99_mean['MXPESQ'])
+                                 / np.maximum(all_nom99_mean['MXPESQ'], 0.01))  # pesq%
+     # all_nom99_mean[3] = 100 * (all_nom99_mean[2] - all_nom99_mean[1]) / np.maximum(all_nom99_mean[1], 0.01)  # pesq%
+     if all_nom99_mean['MXWER'] == 0.0:
+         if all_nom99_mean['WER'] == 0.0:
+             all_nom99_mean['WERi%'] = 0.0
          else:
-             all_nom99_mean[6] = -999.0
-     else:
-         all_nom99_mean[6] = 100 * (all_nom99_mean[4] - all_nom99_mean[5]) / all_nom99_mean[4]  # wer%
+             all_nom99_mean['WERi%'] = -999.0
+     else:  # wer%
+         all_nom99_mean['WERi%'] = 100 * (all_nom99_mean['MXWER'] - all_nom99_mean['WER']) / all_nom99_mean['MXWER']
 
      num_mix = len(mixids)
      if num_mix > 1:
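The averages above are now corrected by column label instead of integer position (the commented-out all_nom99_mean[3] form): positional writes silently target the wrong column once the table gains fields such as STOI and CSIG. A small illustration with made-up values:

    import numpy as np
    import pandas as pd

    s = pd.Series({'MXPESQ': 1.2, 'PESQ': 1.8, 'PESQi%': 0.0, 'MXWER': 20.0, 'WER': 10.0, 'WERi%': 0.0})
    s['PESQi%'] = 100 * (s['PESQ'] - s['MXPESQ']) / np.maximum(s['MXPESQ'], 0.01)
    s['WERi%'] = 100 * (s['MXWER'] - s['WER']) / s['MXWER']
    print(s['PESQi%'], s['WERi%'])  # 50.0 50.0
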
sonusai/data_generator/torch_from_mixdb.py CHANGED
@@ -32,7 +32,7 @@ class MixtureDatabaseDataset(Dataset):
      def __len__(self):
          return len(self.mixids)
 
-     def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray]:
+     def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray, int]:
          """Get data from one mixture
          """
          import random
@@ -68,7 +68,7 @@ class MixtureDatabaseDataset(Dataset):
          feature = feature[start:start + self.cut_len]
          truth = truth[start:start + self.cut_len]
 
-         return feature, truth
+         return feature, truth, idx
 
 
  class AawareDataLoader(DataLoader):
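Returning idx as a third element lets a consumer map each batch back to the mixtures it came from; a hedged sketch of the consuming side (loader stands for a data loader built over this dataset, a name assumed here):

    for feature, truth, idx in loader:
        # after default collation, idx holds the dataset indices in this batch
        print(f'batch drawn from dataset indices {idx}')
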
sonusai/mixture/audio.py CHANGED
@@ -59,6 +59,7 @@ def get_duration(audio: AudioT) -> float:
      :return: Duration of audio in seconds
      """
      from .constants import SAMPLE_RATE
+
      return len(audio) / SAMPLE_RATE
 
 
@@ -66,14 +67,15 @@ def validate_input_file(input_filepath: str) -> None:
      from os.path import exists
      from os.path import splitext
 
-     from torchaudio.utils.sox_utils import list_read_formats
+     from soundfile import available_formats
 
      from sonusai import SonusAIError
+
      if not exists(input_filepath):
          raise SonusAIError(f'input_filepath {input_filepath} does not exist.')
 
      ext = splitext(input_filepath)[1][1:].lower()
-     read_formats = list_read_formats()
+     read_formats = [item.lower() for item in available_formats().keys()]
      if ext not in read_formats:
          raise SonusAIError(f'This installation of SoX cannot process .{ext} files')
 
@@ -86,6 +88,7 @@ def read_audio(name: Location) -> AudioT:
      :return: Array of time domain audio data
      """
      from .torchaudio_audio import read_torchaudio_audio
+
      return read_torchaudio_audio(name)
 
 
@@ -97,4 +100,5 @@ def read_ir(name: Location) -> ImpulseResponseData:
      :return: ImpulseResponseData object
      """
      from .torchaudio_audio import read_torchaudio_ir
+
      return read_torchaudio_ir(name)
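On the validate_input_file change above: soundfile.available_formats() returns a dict keyed by format name (e.g. {'WAV': 'WAV (Microsoft)', 'FLAC': ...}), so lower-casing the keys keeps the extension check case-insensitive. For example:

    from soundfile import available_formats

    read_formats = [item.lower() for item in available_formats().keys()]
    print('wav' in read_formats)  # True on typical libsndfile builds
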
sonusai/mixture/torchaudio_audio.py CHANGED
@@ -21,7 +21,7 @@ def read_torchaudio_ir(name: Location) -> ImpulseResponseData:
 
      # Read impulse response data from audio file
      try:
-         raw, sample_rate = torchaudio.load(expanded_name)
+         raw, sample_rate = torchaudio.load(expanded_name, backend='soundfile')
      except Exception as e:
          if name != expanded_name:
              raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
@@ -58,7 +58,7 @@ def read_torchaudio_audio(name: Location) -> AudioT:
      expanded_name, _ = tokenized_expand(name)
 
      try:
-         out, samplerate = torchaudio.load(expanded_name)
+         out, samplerate = torchaudio.load(expanded_name, backend='soundfile')
          out = torch.reshape(out[0, :], (1, out.size()[1]))
 
          if not samplerate == SAMPLE_RATE:
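Passing backend='soundfile' pins decoding to libsndfile through the torchaudio >= 2.0 backend dispatcher instead of relying on a SoX build. A minimal sketch, assuming some local 'example.wav':

    import torchaudio

    waveform, sample_rate = torchaudio.load('example.wav', backend='soundfile')
    print(waveform.shape, sample_rate)
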
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonusai
- Version: 0.14.0
+ Version: 0.14.2
  Summary: Framework for building deep neural network models for sound, speech, and voice AI
  Home-page: https://aaware.com
  License: GPL-3.0-only
@@ -30,9 +30,11 @@ Requires-Dist: paho-mqtt (>=1.6.1,<2.0.0)
  Requires-Dist: pandas (>=2.1.1,<3.0.0)
  Requires-Dist: pesq (>=0.0.4,<0.0.5)
  Requires-Dist: pyaaware (>=1.5.3,<2.0.0)
+ Requires-Dist: pystoi (>=0.3.3,<0.4.0)
  Requires-Dist: requests (>=2.31.0,<3.0.0)
  Requires-Dist: scikit-learn (>=1.3.1,<2.0.0)
  Requires-Dist: sh (>=2.0.6,<3.0.0)
+ Requires-Dist: soundfile (>=0.12.1,<0.13.0)
  Requires-Dist: sox (>=1.4.1,<2.0.0)
  Requires-Dist: speechrecognition (>=3.10.0,<4.0.0)
  Requires-Dist: sqlalchemy[mypy] (>=2.0.22,<3.0.0)
@@ -1,6 +1,6 @@
  sonusai/__init__.py,sha256=KmIJ9wni9d9v5pyu0pUxbacZIHGkAywB9CJwl7JME28,1526
  sonusai/aawscd_probwrite.py,sha256=GukR5owp_0A3DrqSl9fHWULYgclNft4D5OkHIwfxxkc,3698
- sonusai/calc_metric_spenh.py,sha256=3RJnSjhh8ZywRkGsYKfaww7YAXdP_R27m5Bn7EkbIX8,44297
+ sonusai/calc_metric_spenh.py,sha256=cE5lexBq6nZHY7-zudqsMsoz5fFYqVAWgKk21dIlHSw,60810
  sonusai/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sonusai/data/genmixdb.yml,sha256=6C1GUr_0P5_hEAYSn0MLAqoSzDk_rP8TyV0sVMZqz1Q,16233
  sonusai/data/speech_ma01_01.wav,sha256=PK0vMKg-NR6rPE3KouxHGF6PKXnJCr7AwjMqfu98LUA,76644
@@ -8,7 +8,7 @@ sonusai/data/whitenoise.wav,sha256=I2umov0m34y56F9IsIBi1XtE76ZeZaSKDf70cJRe3pI,1
  sonusai/data_generator/__init__.py,sha256=ouCpY5EDV35fKFeKGQfIcU8uE-c3QcuNerTxUA1X5L8,232
  sonusai/data_generator/dataset_from_mixdb.py,sha256=4eQjyZ2TM2FVgbS9Cy8nevfYMBaIyrmHtUiQzJN19Do,5469
  sonusai/data_generator/keras_from_mixdb.py,sha256=V5CUsGz-akIYdgQy9ABxwNKMYKv01klA4xtMDveF6uI,6167
- sonusai/data_generator/torch_from_mixdb.py,sha256=N5PFpmXJFUgewUUdGaUl1VPP95W1CYuxIWiKOe8Y56g,4269
+ sonusai/data_generator/torch_from_mixdb.py,sha256=lvEe9DDu_rIaoyhv9PW4UAnAWp5N74L8kRfxUhsh7oo,4279
  sonusai/evaluate.py,sha256=OH9g3l8yD4X-HHUf-qQriznXQJSW0gtf7XO4P-jbo1U,10025
  sonusai/genft.py,sha256=CnBiQKHQHZMlrq-F1QQJfpw-_45uhyut8cY-O7oDrTk,5557
  sonusai/genmix.py,sha256=l3n-vvSDtwIvYNw9Ulkn5fgAeoyh7reQgGE4Vvth334,7016
@@ -33,7 +33,7 @@ sonusai/metrics/confusion_matrix_summary.py,sha256=3qg6TMKjJeHtNjj2YnNjPFSlMrQXt
  sonusai/metrics/one_hot.py,sha256=lq58zKw0X9sdhJYGEldAkxPFqP3UOYpG_KdxkGHF_3c,13540
  sonusai/metrics/snr_summary.py,sha256=P4U5_Xr7v9F8kF-rZBnpsVNt3p42rIVS6zmch8yfVfg,5575
  sonusai/mixture/__init__.py,sha256=xlGw2FXoMZm2ra97GVfpJ-OTOp10d4dly8AXe8eJwhI,5294
- sonusai/mixture/audio.py,sha256=13zBg-ix3SC7xzFFX9WcGUuWowodRVI6pryWRmH7YmY,3221
+ sonusai/mixture/audio.py,sha256=3pat-AIG_FXiGr3aPRa7DSLzolH3PodVDtve-xUuXfk,3242
  sonusai/mixture/augmentation.py,sha256=HwYUJCSmRBWhdnzqKz5zZnMANT83GzJkDrPcWUm6jbg,10884
  sonusai/mixture/class_count.py,sha256=27YDu1puarhp7Rd4EYWGJ-FHP8rAYGd55I6abGqCscY,988
  sonusai/mixture/config.py,sha256=QrasMP-2NGocse2rF_oYkRluDDPo-czFLDEwKtQ8A54,23629
@@ -50,7 +50,7 @@ sonusai/mixture/spectral_mask.py,sha256=qHR2DBpbtz4u1o9sdFMRsUDVUjbof_MRKPW8uY4R
  sonusai/mixture/target_class_balancing.py,sha256=P3gLe2SFos5_N2LWiVFwD-fa_imZH2f1qBiI55BeqXI,4768
  sonusai/mixture/targets.py,sha256=n7PenQuU0pPM_LLXJHmUZ3VeSGDEk7Kdf8y473Xdm6Q,7395
  sonusai/mixture/tokenized_shell_vars.py,sha256=gCxw8SQUcal6mqWKF7hOBTgSQmbJUk1nT0Gn3H8GA0U,4705
- sonusai/mixture/torchaudio_audio.py,sha256=HL11-1_UK9QuvCP17cAnlfsBLZzR0aqLif7gRexxySM,2323
+ sonusai/mixture/torchaudio_audio.py,sha256=qMYXeOSI8U8zaT9x0knPg1dHWzYmswZk7oFGAMG0Jks,2365
  sonusai/mixture/torchaudio_augmentation.py,sha256=LrG19X71UYKMr69WNgJs2R4OTt1QBYu_h8WL5a4ERyE,4462
  sonusai/mixture/truth.py,sha256=Is-nqLXIBM7wjYbS6yzy8mnR8JqxwSabnVHsza0rh_E,1427
  sonusai/mixture/truth_functions/__init__.py,sha256=82lKYHhLy8KW3gHngrocoqwupGVLVsWdIXdYs3vhjOc,359
@@ -114,7 +114,7 @@ sonusai/utils/trim_docstring.py,sha256=dSrtiRsEN4wkkvKBp6WDr13RUypfqZzgH_jOBLs1o
  sonusai/utils/wave.py,sha256=OZe8iVLbKSFv_GdQzLD9hJdBiqimK4FxJ0lVoDbbiqQ,572
  sonusai/utils/yes_or_no.py,sha256=eMLXBVH0cEahiXY4W2KNORmwNQ-ba10eRtldh0y4NYg,263
  sonusai/vars.py,sha256=m2AefF0m5bXWGXpJj8Pi42zWL2ydeEj7bkak3GrtMyM,940
- sonusai-0.14.0.dist-info/METADATA,sha256=DZPAjTdEyjACmrkTgbc0nrP7S-U-IXmqN363HbuCTvE,2736
- sonusai-0.14.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
- sonusai-0.14.0.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
- sonusai-0.14.0.dist-info/RECORD,,
+ sonusai-0.14.2.dist-info/METADATA,sha256=RR8bQ-ZUGFqZZJID86OMAAM6N0h7MYpfwJlDYf4t0v4,2819
+ sonusai-0.14.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+ sonusai-0.14.2.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
+ sonusai-0.14.2.dist-info/RECORD,,