sonusai 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- sonusai/calc_metric_spenh.py +524 -166
- sonusai/data_generator/torch_from_mixdb.py +2 -2
- sonusai/mixture/audio.py +6 -2
- sonusai/mixture/torchaudio_audio.py +2 -2
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/METADATA +3 -1
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/RECORD +8 -8
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/WHEEL +0 -0
- {sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/entry_points.txt +0 -0
sonusai/calc_metric_spenh.py
CHANGED
@@ -13,7 +13,7 @@ options:
     -e WER, --wer-method WER     Word-Error-Rate method: deepgram, google, aixplain_whisper
                                  or whisper (locally run) [default: none]
     -m WMNAME, --whisper-model   Whisper model name used in aixplain_whisper and whisper WER methods.
-                                 [default:
+                                 [default: tiny]

 Calculate speech enhancement metrics of prediction data in PLOC using SonusAI mixture data
 in TLOC as truth/label reference. Metric and extraction data files are written into PLOC.
@@ -83,6 +83,7 @@ matplotlib.use('SVG')
 class MPGlobal:
     mixdb: MixtureDatabase = None
     predict_location: Location = None
+    predwav_mode: bool = None
     truth_est_mode: bool = None
     enable_plot: bool = None
     enable_wav: bool = None
@@ -111,6 +112,304 @@ def power_uncompress(spec):
     return real_uncompress + 1j * imag_uncompress


+def snr(clean_speech, processed_speech, sample_rate):
+    # Check the length of the clean and processed speech. Must be the same.
+    clean_length = len(clean_speech)
+    processed_length = len(processed_speech)
+    if clean_length != processed_length:
+        raise ValueError('Both Speech Files must be same length.')
+
+    overall_snr = 10 * np.log10(np.sum(np.square(clean_speech)) / np.sum(np.square(clean_speech - processed_speech)))
+
+    # Global Variables
+    winlength = round(30 * sample_rate / 1000)  # window length in samples
+    skiprate = int(np.floor(winlength / 4))  # window skip in samples
+    MIN_SNR = -10  # minimum SNR in dB
+    MAX_SNR = 35  # maximum SNR in dB
+
+    # For each frame of input speech, calculate the Segmental SNR
+    num_frames = int(clean_length / skiprate - (winlength / skiprate))  # number of frames
+    start = 0  # starting sample
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+    segmental_snr = np.empty(num_frames)
+    EPS = np.spacing(1)
+    for frame_count in range(num_frames):
+        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
+        clean_frame = clean_speech[start:start + winlength]
+        processed_frame = processed_speech[start:start + winlength]
+        clean_frame = np.multiply(clean_frame, window)
+        processed_frame = np.multiply(processed_frame, window)
+
+        # (2) Compute the Segmental SNR
+        signal_energy = np.sum(np.square(clean_frame))
+        noise_energy = np.sum(np.square(clean_frame - processed_frame))
+        segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + EPS) + EPS)
+        segmental_snr[frame_count] = max(segmental_snr[frame_count], MIN_SNR)
+        segmental_snr[frame_count] = min(segmental_snr[frame_count], MAX_SNR)
+
+        start = start + skiprate
+
+    return overall_snr, segmental_snr
+
+
+def lpcoeff(speech_frame, model_order):
+    # (1) Compute Autocorrelation Lags
+    winlength = np.size(speech_frame)
+    R = np.empty(model_order + 1)
+    E = np.empty(model_order + 1)
+    for k in range(model_order + 1):
+        R[k] = np.dot(speech_frame[0:winlength - k], speech_frame[k: winlength])
+
+    # (2) Levinson-Durbin
+    a = np.ones(model_order)
+    a_past = np.empty(model_order)
+    rcoeff = np.empty(model_order)
+    E[0] = R[0]
+    for i in range(model_order):
+        a_past[0: i] = a[0: i]
+        sum_term = np.dot(a_past[0: i], R[i:0:-1])
+        rcoeff[i] = (R[i + 1] - sum_term) / E[i]
+        a[i] = rcoeff[i]
+        if i == 0:
+            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1:-1:-1], rcoeff[i])
+        else:
+            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1::-1], rcoeff[i])
+        E[i + 1] = (1 - rcoeff[i] * rcoeff[i]) * E[i]
+    acorr = R
+    refcoeff = rcoeff
+    lpparams = np.concatenate((np.array([1]), -a))
+    return acorr, refcoeff, lpparams
+
+
+def llr(clean_speech, processed_speech, sample_rate):
+    from scipy.linalg import toeplitz
+
+    # Check the length of the clean and processed speech. Must be the same.
+    clean_length = np.size(clean_speech)
+    processed_length = np.size(processed_speech)
+    if clean_length != processed_length:
+        raise ValueError('Both Speech Files must be same length.')
+
+    # Global Variables
+    winlength = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+    skiprate = (np.floor(winlength / 4)).astype(int)  # window skip in samples
+    if sample_rate < 10000:
+        P = 10  # LPC Analysis Order
+    else:
+        P = 16  # this could vary depending on sampling frequency.
+
+    # For each frame of input speech, calculate the Log Likelihood Ratio
+    num_frames = int((clean_length - winlength) / skiprate)  # number of frames
+    start = 0  # starting sample
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+    distortion = np.empty(num_frames)
+    for frame_count in range(num_frames):
+        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
+        clean_frame = clean_speech[start: start + winlength]
+        processed_frame = processed_speech[start: start + winlength]
+        clean_frame = np.multiply(clean_frame, window)
+        processed_frame = np.multiply(processed_frame, window)
+
+        # (2) Get the autocorrelation lags and LPC parameters used to compute the LLR measure.
+        R_clean, Ref_clean, A_clean = lpcoeff(clean_frame, P)
+        R_processed, Ref_processed, A_processed = lpcoeff(processed_frame, P)
+
+        # (3) Compute the LLR measure
+        numerator = np.dot(np.matmul(A_processed, toeplitz(R_clean)), A_processed)
+        denominator = np.dot(np.matmul(A_clean, toeplitz(R_clean)), A_clean)
+        distortion[frame_count] = np.log(numerator / denominator)
+        start = start + skiprate
+    return distortion
+
+
+def wss(clean_speech, processed_speech, sample_rate):
+    from scipy.fftpack import fft
+
+    # Check the length of the clean and processed speech, which must be the same.
+    clean_length = np.size(clean_speech)
+    processed_length = np.size(processed_speech)
+    if clean_length != processed_length:
+        raise ValueError('Files must have same length.')
+
+    # Global variables
+    winlength = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+    skiprate = (np.floor(np.divide(winlength, 4))).astype(int)  # window skip in samples
+    max_freq = (np.divide(sample_rate, 2)).astype(int)  # maximum bandwidth
+    num_crit = 25  # number of critical bands
+
+    USE_FFT_SPECTRUM = 1  # defaults to 10th order LP spectrum
+    n_fft = (np.power(2, np.ceil(np.log2(2 * winlength)))).astype(int)
+    n_fftby2 = (np.multiply(0.5, n_fft)).astype(int)  # FFT size/2
+    Kmax = 20.0  # value suggested by Klatt, pg 1280
+    Klocmax = 1.0  # value suggested by Klatt, pg 1280
+
+    # Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
+    cent_freq = np.array([50.0000, 120.000, 190.000, 260.000, 330.000, 400.000, 470.000,
+                          540.000, 617.372, 703.378, 798.717, 904.128, 1020.38, 1148.30,
+                          1288.72, 1442.54, 1610.70, 1794.16, 1993.93, 2211.08, 2446.71,
+                          2701.97, 2978.04, 3276.17, 3597.63])
+    bandwidth = np.array([70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000,
+                          77.3724, 86.0056, 95.3398, 105.411, 116.256, 127.914, 140.423,
+                          153.823, 168.154, 183.457, 199.776, 217.153, 235.631, 255.255,
+                          276.072, 298.126, 321.465, 346.136])
+
+    bw_min = bandwidth[0]  # minimum critical bandwidth
+
+    # Set up the critical band filters.
+    # Note here that Gaussianly shaped filters are used.
+    # Also, the sum of the filter weights are equivalent for each critical band filter.
+    # Filter less than -30 dB and set to zero.
+    min_factor = np.exp(-30.0 / (2.0 * 2.303))  # -30 dB point of filter
+    crit_filter = np.empty((num_crit, n_fftby2))
+    for i in range(num_crit):
+        f0 = (cent_freq[i] / max_freq) * n_fftby2
+        bw = (bandwidth[i] / max_freq) * n_fftby2
+        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
+        j = np.arange(n_fftby2)
+        crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
+        cond = np.greater(crit_filter[i, :], min_factor)
+        crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
+    # For each frame of input speech, calculate the Weighted Spectral Slope Measure
+    num_frames = int(clean_length / skiprate - (winlength / skiprate))  # number of frames
+    start = 0  # starting sample
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, winlength + 1) / (winlength + 1)))
+
+    distortion = np.empty(num_frames)
+    for frame_count in range(num_frames):
+        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
+        clean_frame = clean_speech[start: start + winlength] / 32768
+        processed_frame = processed_speech[start: start + winlength] / 32768
+        clean_frame = np.multiply(clean_frame, window)
+        processed_frame = np.multiply(processed_frame, window)
+        # (2) Compute the Power Spectrum of Clean and Processed
+        # if USE_FFT_SPECTRUM:
+        clean_spec = np.square(np.abs(fft(clean_frame, n_fft)))
+        processed_spec = np.square(np.abs(fft(processed_frame, n_fft)))
+
+        # (3) Compute Filterbank Output Energies (in dB scale)
+        clean_energy = np.matmul(crit_filter, clean_spec[0:n_fftby2])
+        processed_energy = np.matmul(crit_filter, processed_spec[0:n_fftby2])
+
+        clean_energy = 10 * np.log10(np.maximum(clean_energy, 1E-10))
+        processed_energy = 10 * np.log10(np.maximum(processed_energy, 1E-10))
+
+        # (4) Compute Spectral Slope (dB[i+1]-dB[i])
+        clean_slope = clean_energy[1:num_crit] - clean_energy[0: num_crit - 1]
+        processed_slope = processed_energy[1:num_crit] - processed_energy[0: num_crit - 1]
+
+        # (5) Find the nearest peak locations in the spectra to each critical band.
+        # If the slope is negative, we search to the left. If positive, we search to the right.
+        clean_loc_peak = np.empty(num_crit - 1)
+        processed_loc_peak = np.empty(num_crit - 1)
+
+        for i in range(num_crit - 1):
+            # find the peaks in the clean speech signal
+            if clean_slope[i] > 0:  # search to the right
+                n = i
+                while (n < num_crit - 1) and (clean_slope[n] > 0):
+                    n = n + 1
+                clean_loc_peak[i] = clean_energy[n - 1]
+            else:  # search to the left
+                n = i
+                while (n >= 0) and (clean_slope[n] <= 0):
+                    n = n - 1
+                clean_loc_peak[i] = clean_energy[n + 1]
+
+            # find the peaks in the processed speech signal
+            if processed_slope[i] > 0:  # search to the right
+                n = i
+                while (n < num_crit - 1) and (processed_slope[n] > 0):
+                    n = n + 1
+                processed_loc_peak[i] = processed_energy[n - 1]
+            else:  # search to the left
+                n = i
+                while (n >= 0) and (processed_slope[n] <= 0):
+                    n = n - 1
+                processed_loc_peak[i] = processed_energy[n + 1]
+
+        # (6) Compute the WSS Measure for this frame. This includes determination of the weighting function.
+        dBMax_clean = np.max(clean_energy)
+        dBMax_processed = np.max(processed_energy)
+        '''
+        The weights are calculated by averaging individual weighting factors from the clean and processed frame.
+        These weights W_clean and W_processed should range from 0 to 1 and place more emphasis on spectral peaks
+        and less emphasis on slope differences in spectral valleys.
+        This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
+        '''
+        Wmax_clean = np.divide(Kmax, Kmax + dBMax_clean - clean_energy[0: num_crit - 1])
+        Wlocmax_clean = np.divide(Klocmax, Klocmax + clean_loc_peak - clean_energy[0: num_crit - 1])
+        W_clean = np.multiply(Wmax_clean, Wlocmax_clean)
+
+        Wmax_processed = np.divide(Kmax, Kmax + dBMax_processed - processed_energy[0: num_crit - 1])
+        Wlocmax_processed = np.divide(Klocmax, Klocmax + processed_loc_peak - processed_energy[0: num_crit - 1])
+        W_processed = np.multiply(Wmax_processed, Wlocmax_processed)

+        W = np.divide(np.add(W_clean, W_processed), 2.0)
+        slope_diff = np.subtract(clean_slope, processed_slope)[0: num_crit - 1]
+        distortion[frame_count] = np.dot(W, np.square(slope_diff)) / np.sum(W)
+        # this normalization is not part of Klatt's paper, but helps to normalize the measure.
+        # Here we scale the measure by the sum of the weights.
+        start = start + skiprate
+    return distortion
+
+
+def calc_speech_metrics(hypothesis: np.ndarray,
+                        reference: np.ndarray) -> tuple[float, int, int, int, float]:
+    """
+    Calculate speech metrics pesq_mos, CSIG, CBAK, COVL, segSNR. These are all related and thus included
+    in one function. Reference: matlab script "compute_metrics.m".
+
+    Usage:
+        pesq, csig, cbak, covl, ssnr = compute_metrics(hypothesis, reference, Fs, path)
+        reference: clean audio as array
+        hypothesis: enhanced audio as array
+        Audio must have sampling rate = 16000 Hz.
+
+    Example call:
+        pesq_output, csig_output, cbak_output, covl_output, ssnr_output = \
+            calc_speech_metrics(predicted_audio, target_audio)
+    """
+    from sonusai.metrics import calc_pesq
+
+    Fs = 16000
+
+    # compute the WSS measure
+    wss_dist_vec = wss(reference, hypothesis, Fs)
+    wss_dist_vec = np.sort(wss_dist_vec)
+    alpha = 0.95  # value from CMGAN ref implementation
+    wss_dist = np.mean(wss_dist_vec[0: round(np.size(wss_dist_vec) * alpha)])
+
+    # compute the LLR measure
+    llr_dist = llr(reference, hypothesis, Fs)
+    ll_rs = np.sort(llr_dist)
+    llr_len = round(np.size(llr_dist) * alpha)
+    llr_mean = np.mean(ll_rs[0: llr_len])
+
+    # compute the SNRseg
+    snr_dist, segsnr_dist = snr(reference, hypothesis, Fs)
+    snr_mean = snr_dist
+    segSNR = np.mean(segsnr_dist)
+
+    # compute the pesq (use Sonusai wrapper, only fs=16k, mode=wb support)
+    pesq_mos = calc_pesq(hypothesis=hypothesis, reference=reference)
+    # pesq_mos = pesq(sampling_rate1, data1, data2, 'wb')
+
+    # now compute the composite measures
+    CSIG = 3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist
+    CSIG = max(1, CSIG)
+    CSIG = min(5, CSIG)  # limit values to [1, 5]
+    CBAK = 1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * segSNR
+    CBAK = max(1, CBAK)
+    CBAK = min(5, CBAK)  # limit values to [1, 5]
+    COVL = 1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist
+    COVL = max(1, COVL)
+    COVL = min(5, COVL)  # limit values to [1, 5]
+
+    return pesq_mos, CSIG, CBAK, COVL, segSNR
+
+
 def mean_square_error(hypothesis: np.ndarray,
                       reference: np.ndarray,
                       squared: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
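The CSIG/CBAK/COVL regressions at the end of this block are the Hu and Loizou composite measures as used in the CMGAN reference implementation. A minimal usage sketch, assuming two equal-length 16 kHz numpy arrays (the synthetic signals below are placeholders for real speech):

import numpy as np
from sonusai.calc_metric_spenh import calc_speech_metrics

rng = np.random.default_rng(0)
reference = rng.standard_normal(16000).astype(np.float32)    # stand-in for clean speech
hypothesis = reference + 0.01 * rng.standard_normal(16000).astype(np.float32)
pesq_mos, csig, cbak, covl, segsnr = calc_speech_metrics(hypothesis, reference)
print(f'PESQ={pesq_mos:.2f} CSIG={csig:.2f} CBAK={cbak:.2f} COVL={covl:.2f} SegSNR={segsnr:.1f} dB')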
@@ -191,31 +490,43 @@ def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray
     return err, err_b, err_f


-def phase_distance(reference: np.ndarray,
-
+def phase_distance(reference: np.ndarray,
+                   hypothesis: np.ndarray,
+                   eps: float = 1e-9) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Calculate weighted phase distance error (weight normalization over bins per frame)

     :param reference: complex [frames, bins]
     :param hypothesis: complex [frames, bins]
+    :param eps: epsilon value
     :return: mean, mean per bin, mean per frame
     """
-
-
-    rh_angle_diff =
+    ang_diff = np.angle(reference) - np.angle(hypothesis)
+    phd_mod = (ang_diff + np.pi) % (2 * np.pi) - np.pi
+    rh_angle_diff = phd_mod * 180 / np.pi  # angle diff in deg
+
+    # Use complex divide to intrinsically keep angle diff +/-180 deg, but avoid div by zero (real hyp)
+    # hyp_real = np.real(hypothesis)
+    # near_zeros = np.real(hyp_real) < eps
+    # hyp_real = hyp_real * (np.logical_not(near_zeros))
+    # hyp_real = hyp_real + (near_zeros * eps)
+    # hypothesis = hyp_real + 1j*np.imag(hypothesis)
+    # rh_angle_diff = np.angle(reference / hypothesis) * 180 / np.pi  # angle diff +/-180

     # weighted mean over all (scalar)
-
+    reference_mag = np.abs(reference)
+    ref_weight = reference_mag / (np.sum(reference_mag) + eps)  # frames x bins
     err = np.around(np.sum(ref_weight * rh_angle_diff), 3)

     # weighted mean over frames (value per bin)
     err_b = np.zeros(reference.shape[1])
     for bi in range(reference.shape[1]):
-        ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) +
+        ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) + eps)
         err_b[bi] = np.around(np.sum(ref_weight * rh_angle_diff[:, bi]), 3)

     # weighted mean over bins (value per frame)
     err_f = np.zeros(reference.shape[0])
     for fi in range(reference.shape[0]):
-        ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) +
+        ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) + eps)
         err_f[fi] = np.around(np.sum(ref_weight * rh_angle_diff[fi, :]), 3)

     return err, err_b, err_f
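The modulo expression in the rewritten phase_distance is the standard trick for wrapping an angle difference into (-180, 180] degrees; a quick standalone check of that identity:

import numpy as np

# 170 deg minus -170 deg is a raw difference of 340 deg ...
ang_diff = np.radians(170.0) - np.radians(-170.0)
# ... which the modulo form wraps to the equivalent -20 deg
wrapped = (ang_diff + np.pi) % (2 * np.pi) - np.pi
print(np.degrees(wrapped))  # -20.0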
@@ -228,6 +539,7 @@ def plot_mixpred(mixture: AudioT,
                  predict: Optional[Predict] = None,
                  tp_title: str = '') -> plt.Figure:
     from sonusai.mixture import SAMPLE_RATE
+
     num_plots = 2
     if feature is not None:
         num_plots += 1
@@ -268,8 +580,8 @@ def plot_mixpred(mixture: AudioT,


 def plot_pdb_predtruth(predict: np.ndarray,
-                       truth_f: np.ndarray
-                       metric: np.ndarray
+                       truth_f: Optional[np.ndarray] = None,
+                       metric: Optional[np.ndarray] = None,
                        tp_title: str = '') -> plt.Figure:
     """Plot predict and optionally truth and a metric in power db, e.g. applies 10*log10(predict)"""
     num_plots = 2
@@ -320,9 +632,9 @@ def plot_pdb_predtruth(predict: np.ndarray,

 def plot_epredtruth(predict: np.ndarray,
                     predict_wav: np.ndarray,
-                    truth_f: np.ndarray
-                    truth_wav: np.ndarray
-                    metric: np.ndarray
+                    truth_f: Optional[np.ndarray] = None,
+                    truth_wav: Optional[np.ndarray] = None,
+                    metric: Optional[np.ndarray] = None,
                     tp_title: str = '') -> plt.Figure:
     """Plot predict spectrogram and waveform and optionally truth and a metric)"""
     num_plots = 2
@@ -390,76 +702,91 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     from os.path import splitext

     import h5py
-    from
+    from numpy import inf
+    from pystoi import stoi

     from sonusai import SonusAIError
+    from sonusai import logger
     from sonusai.metrics import calc_pcm
-    from sonusai.metrics import calc_pesq
-    from sonusai.metrics import calc_sa_sdr
     from sonusai.metrics import calc_wer
     from sonusai.metrics import calc_wsdr
+    from sonusai.mixture import forward_transform
     from sonusai.mixture import inverse_transform
+    from sonusai.mixture import read_audio
     from sonusai.utils import calc_asr
     from sonusai.utils import float_to_int16
     from sonusai.utils import reshape_outputs
+    from sonusai.utils import stack_complex
     from sonusai.utils import unstack_complex
     from sonusai.utils import write_wav

+    mixdb = MP_GLOBAL.mixdb
+    predict_location = MP_GLOBAL.predict_location
+    predwav_mode = MP_GLOBAL.predwav_mode
+    truth_est_mode = MP_GLOBAL.truth_est_mode
+    enable_plot = MP_GLOBAL.enable_plot
+    enable_wav = MP_GLOBAL.enable_wav
+    wer_method = MP_GLOBAL.wer_method
+    whisper_model = MP_GLOBAL.whisper_model
+
     # 1) Read predict data, var predict with shape [BatchSize,Classes] or [BatchSize,Tsteps,Classes]
-    output_name = join(
+    output_name = join(predict_location, mixdb.mixture(mixid).name)
     predict = None
-    if
-
-
-
-            predict = np.array(f['predict'])
-        except Exception as e:
-            raise SonusAIError(f'Error reading {output_name}: {e}')
-        # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
-        if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
-            # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
-            predict, _ = reshape_outputs(predict=predict, timesteps=predict.shape[1])
-    else:
-        # in truth estimation mode we use the truth instead of prediction to see metrics with perfect input
-        # so don't bother to read prediction and mark outputs with tru suffix, i.e. 0000_truest_*
+    if truth_est_mode:
+        # in truth estimation mode we use the truth in place of prediction to see metrics with perfect input
+        # don't bother to read prediction, and predict var will get assigned to truth later
+        # mark outputs with tru suffix, i.e. 0000_truest_*
         base_name = splitext(output_name)[0] + '_truest'
+    else:
+        base_name, ext = splitext(output_name)  # base_name used later
+        if not predwav_mode:
+            try:
+                with h5py.File(output_name, 'r') as f:
+                    predict = np.array(f['predict'])
+            except Exception as e:
+                raise SonusAIError(f'Error reading {output_name}: {e}')
+            # reshape to always be [frames,classes] where ndim==3 case frames = batch * tsteps
+            if predict.ndim > 2:  # TBD generalize to somehow detect if timestep dim exists, some cases > 2 don't have
+                # logger.debug(f'Prediction reshape from {predict.shape} to remove timestep dimension.')
+                predict, _ = reshape_outputs(predict=predict, truth=None, timesteps=predict.shape[1])
+        else:
+            base_name, ext = splitext(output_name)
+            prfname = join(base_name + '.wav')
+            audio = read_audio(prfname)
+            predict = forward_transform(audio, mixdb.ft_config)
+            if mixdb.feature[0:1] == 'h':
+                predict = power_compress(predict)
+            predict = stack_complex(predict)

     # 2) Collect true target, noise, mixture data, trim to predict size if needed
-    target =
-    target_f =
-    noise =
-    noise_f =
-    mixture =
-    mixture_f =
-    segsnr_f =
-    segsnr_f[segsnr_f ==
-    segsnr_f[segsnr_f == -
+    target = mixdb.mixture_target(mixid)
+    target_f = mixdb.mixture_target_f(mixid, target=target)
+    noise = mixdb.mixture_noise(mixid)
+    noise_f = mixdb.mixture_noise_f(mixid, noise=noise)
+    mixture = mixdb.mixture_mixture(mixid, target=target, noise=noise)
+    mixture_f = mixdb.mixture_mixture_f(mixid, mixture=mixture)
+    segsnr_f = mixdb.mixture_segsnr(mixid, target=target, noise=noise)
+    segsnr_f[segsnr_f == inf] = 7.944e8  # 99db
+    segsnr_f[segsnr_f == -inf] = 1.258e-10  # -99db
     # need to use inv-tf to match #samples & latency shift properties of predict inv tf
-    targetfi = inverse_transform(target_f,
-    noisefi = inverse_transform(noise_f,
-    # mixturefi = inverse_transform(mixture_f
+    targetfi = inverse_transform(target_f, mixdb.it_config)
+    noisefi = inverse_transform(noise_f, mixdb.it_config)
+    # mixturefi = mixdb.inverse_transform(mixture_f)

     # gen feature, truth - note feature only used for plots
     # TBD parse truth_f for different formats and also multi-truth
-    feature, truth_f =
-    truth_type =
-    0].function
+    feature, truth_f = mixdb.mixture_ft(mixid, mixture=mixture)
+    truth_type = mixdb.target(mixdb.mixture(mixid).target_id[0]).truth_settings[0].function
     if truth_type == 'target_mixture_f':
         half = truth_f.shape[-1] // 2
         truth_f = truth_f[..., :half]  # extract target_f only

-    if
-        raise SonusAIError(f'Error: mixture {mixid} does not have the same number of frames as truth, '
-                           f'{target_f.shape[0]} != {truth_f.shape[0]}')
-
-    if not MP_GLOBAL.truth_est_mode:
+    if not truth_est_mode:
         if predict.shape[0] < target_f.shape[0]:  # target_f, truth_f, mixture_f, etc. same size
             trimf = target_f.shape[0] - predict.shape[0]
-            logger.debug(f'Warning: prediction {
-                         f'{predict.shape[0]} < {target_f.shape[0]}'
-                         f'trimming {trimf} frames from all truth.')
+            logger.debug(f'Warning: prediction frames less than mixture, trimming {trimf} frames from all truth.')
             target_f = target_f[0:-trimf, :]
-            targetfi = inverse_transform(target_f,
+            targetfi, _ = inverse_transform(target_f, mixdb.it_config)
             trimt = target.shape[0] - targetfi.shape[0]
             target = target[0:-trimt]
             noise_f = noise_f[0:-trimf, :]
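For reference, a minimal sketch of the .h5 predict-file layout the reader above expects: one 'predict' dataset per mixture, shaped [frames, classes] or [batch, timesteps, classes] (the file name here is a hypothetical example):

import h5py
import numpy as np

with h5py.File('0000.h5', 'r') as f:  # hypothetical predict file name
    predict = np.array(f['predict'])
if predict.ndim > 2:
    # collapse batch and timestep dimensions to [frames, classes]
    predict = predict.reshape(-1, predict.shape[-1])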
@@ -468,30 +795,29 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             mixture = mixture[0:-trimt]
             truth_f = truth_f[0:-trimf, :]
         elif predict.shape[0] > target_f.shape[0]:
-            raise SonusAIError(
-
+            raise SonusAIError(
+                f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')

     # 3) Extraction - format proper complex and wav estimates and truth (unstack, uncompress, inv tf, etc.)
-    if
+    if truth_est_mode:
         predict = truth_f  # substitute truth for the prediction (for test/debug)
         predict_complex = unstack_complex(predict)  # unstack
         # if feat has compressed mag and truth does not, compress it
-        if
-        0:10] != 'targetcmpr':
+        if mixdb.feature[0:1] == 'h' and mixdb.target(1).truth_settings[0].function[0:10] != 'targetcmpr':
            predict_complex = power_compress(predict_complex)  # from uncompressed truth
     else:
         predict_complex = unstack_complex(predict)

     truth_f_complex = unstack_complex(truth_f)
-    if
+    if mixdb.feature[0:1] == 'h':  # 'hn' or 'ha' or 'hd', etc.: # if feat has compressed mag
         # estimate noise in uncompressed-mag domain
         noise_est_complex = mixture_f - power_uncompress(predict_complex)
         predict_complex = power_uncompress(predict_complex)  # uncompress if truth is compressed
     else:  # cn, c8, ..
         noise_est_complex = mixture_f - predict_complex

-    target_est_wav = inverse_transform(predict_complex,
-    noise_est_wav = inverse_transform(noise_est_complex,
+    target_est_wav = inverse_transform(predict_complex, mixdb.it_config)
+    noise_est_wav = inverse_transform(noise_est_complex, mixdb.it_config)

     # 4) Metrics
     # Target/Speech logerr - PSD estimation accuracy symmetric mean log-spectral distortion
@@ -509,19 +835,25 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # Noise td logerr
     # lerr_nt, lerr_nt_bin, lerr_nt_frame = log_error(noisefi, noise_truth_est_audio)

-    # SA-SDR (time-domain source-
+    # # SA-SDR (time-domain source-aggragated SDR)
     ytrue = np.concatenate((targetfi[:, np.newaxis], noisefi[:, np.newaxis]), axis=1)
     ypred = np.concatenate((target_est_wav[:, np.newaxis], noise_est_wav[:, np.newaxis]), axis=1)
-    # note: w/o scale is more pessimistic number
-    sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
+    # # note: w/o scale is more pessimistic number
+    # sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
+    target_stoi = stoi(targetfi, target_est_wav, 16000, extended=False)

     wsdr, wsdr_cc, wsdr_cw = calc_wsdr(hypothesis=ypred, reference=ytrue, with_log=True)
-    logger.debug(f'
+    # logger.debug(f'wsdr weight sum for mixid {mixid} = {np.sum(wsdr_cw)}.')
+    # logger.debug(f'wsdr cweights = {wsdr_cw}.')
+    # logger.debug(f'wsdr ccoefs for mixid {mixid} = {wsdr_cc}.')

     # Speech intelligibility measure - PESQ
-    if int(
-
-
+    if int(mixdb.mixture(mixid).snr) > -99:
+        # len = target_est_wav.shape[0]
+        pesq_speech, csig_tg, cbak_tg, covl_tg, sgsnr_tg = calc_speech_metrics(target_est_wav, targetfi)
+        pesq_mixture, csig_mx, cbak_mx, covl_mx, sgsnr_mx = calc_speech_metrics(mixture, target)
+        # pesq_speech_tst = calc_pesq(hypothesis=target_est_wav, reference=target)
+        # pesq_mixture_tst = calc_pesq(hypothesis=mixture, reference=target)
         # pesq improvement
         pesq_impr = pesq_speech - pesq_mixture
         # pesq improvement %
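The new target_stoi value comes from pystoi, whose stoi(reference, degraded, fs_sig, extended=False) call returns a scalar intelligibility score (roughly 0 to 1). A minimal sketch with placeholder signals standing in for real speech:

import numpy as np
from pystoi import stoi

fs = 16000
rng = np.random.default_rng(0)
clean = rng.standard_normal(3 * fs).astype(np.float32)    # placeholder for clean speech
degraded = clean + 0.3 * rng.standard_normal(3 * fs).astype(np.float32)
print(stoi(clean, degraded, fs, extended=False))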
@@ -530,12 +862,18 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         pesq_speech = 0
         pesq_mixture = 0
         pesq_impr_pc = np.float32(0)
+        csig_mx = 0
+        csig_tg = 0
+        cbak_mx = 0
+        cbak_tg = 0
+        covl_mx = 0
+        covl_tg = 0

     # Calc WER
-
-
-
-
+    asr_tt = ''
+    asr_mx = ''
+    asr_tge = ''
+    if wer_method == 'none' or mixdb.mixture(mixid).snr == -99:  # noise only, ignore/reset target asr
         wer_mx = float('nan')
         wer_tge = float('nan')
         wer_pi = float('nan')
@@ -543,13 +881,11 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         if MP_GLOBAL.mixdb.asr_manifests:
             asr_tt = MP_GLOBAL.mixdb.mixture_asr_data(mixid)[0]  # ignore mixup
         else:
-            asr_tt = calc_asr(target, engine=
-                              whisper_model_name=MP_GLOBAL.whisper_model).text  # target truth
+            asr_tt = calc_asr(target, engine=wer_method, whisper_model_name=whisper_model).text  # target truth

         if asr_tt:
-            asr_mx = calc_asr(mixture, engine=
-            asr_tge = calc_asr(target_est_wav, engine=
-                               whisper_model_name=MP_GLOBAL.whisper_model).text
+            asr_mx = calc_asr(mixture, engine=wer_method, whisper_model=whisper_model).text
+            asr_tge = calc_asr(target_est_wav, engine=wer_method, whisper_model=whisper_model).text

             wer_mx = calc_wer(asr_mx, asr_tt).wer * 100  # mixture wer
             wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate wer
@@ -561,24 +897,21 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             else:
                 wer_pi = 100 * (wer_mx - wer_tge) / wer_mx
         else:
-
-            print(f'Warning: mixid {mixid} asr truth is empty, setting to 0% wer')
-            asr_mx = ''
-            asr_tge = ''
+            print(f'Warning: mixid {mixid} asr truth is empty, setting to 0% wer')
             wer_mx = float(0)
             wer_tge = float(0)
             wer_pi = float(0)

     # 5) Save per mixture metric results
     # Single row in table of scalar metrics per mixture
-    mtable1_col = ['MXSNR', 'MXPESQ', 'PESQ', 'PESQi%', 'MXWER', 'WER', 'WERi%', 'WSDR', '
-                   'PCM', 'SPLERR', 'NLERR', 'PD', '
-
-
-
-
-             basename(
+    mtable1_col = ['MXSNR', 'MXPESQ', 'PESQ', 'PESQi%', 'MXWER', 'WER', 'WERi%', 'WSDR', 'STOI',
+                   'PCM', 'SPLERR', 'NLERR', 'PD', 'MXCSIG', 'CSIG', 'MXCBAK', 'CBAK', 'MXCOVL', 'COVL',
+                   'SPFILE', 'NFILE']
+    ti = mixdb.mixture(mixid).target_id[0]
+    ni = mixdb.mixture(mixid).noise_id
+    metr1 = [mixdb.mixture(mixid).snr, pesq_mixture, pesq_speech, pesq_impr_pc, wer_mx, wer_tge, wer_pi, wsdr,
+             target_stoi, pcm, lerr_tg, lerr_n, phd, csig_mx, csig_tg, cbak_mx, cbak_tg, covl_mx, covl_tg,
+             basename(mixdb.target(ti).name), basename(mixdb.noise(ni).name)]
     mtab1 = pd.DataFrame([metr1], columns=mtable1_col, index=[mixid])

     # Stats of per frame estimation metrics
@@ -588,7 +921,8 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
                           'NLERR': lerr_n_frame,
                           'SPD': phd_frame})
     metr2 = metr2.describe()  # Use pandas stat function
-    metr2['SSNR'][1:] = metr2['SSNR'][1:].apply(
+    metr2['SSNR'][1:] = metr2['SSNR'][1:].apply(
+        lambda x: 10 * np.log10(x + 1.01e-10))  # Change SSNR stats to dB, except count
     # create a single row in multi-column header
     new_labels = pd.MultiIndex.from_product([metr2.columns,
                                              ['Avg', 'Min', 'Med', 'Max', 'Std']],
@@ -597,7 +931,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     mtab2 = pd.DataFrame(dat1row,
                          index=[mixid],
                          columns=new_labels)
-    mtab2.insert(0, 'MXSNR',
+    mtab2.insert(0, 'MXSNR', mixdb.mixture(mixid).snr, False)  # add MXSNR as the first metric column

     all_metrics_table_1 = mtab1  # return to be collected by process
     all_metrics_table_2 = mtab2  # return to be collected by process
@@ -610,41 +944,44 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         print(f'Extraction statistics over {mixture_f.shape[0]} frames:', file=f)
         print(metr2.round(2).to_string(float_format=lambda x: "{:.2f}".format(x)), file=f)
         print('', file=f)
-        print(f'Target path: {
-        print(f'Noise path: {
-        if
-            print(f'WER method: {
-        if
+        print(f'Target path: {mixdb.target(ti).name}', file=f)
+        print(f'Noise path: {mixdb.noise(ni).name}', file=f)
+        if wer_method != 'none':
+            print(f'WER method: {wer_method} and whisper model (if used): {whisper_model}', file=f)
+        if mixdb.asr_manifests:
             print(f'ASR truth from manifest: {asr_tt}', file=f)
         else:
             print(f'ASR truth from wer method: {asr_tt}', file=f)
         print(f'ASR result for mixture: {asr_mx}', file=f)
         print(f'ASR result for prediction: {asr_tge}', file=f)
-
+
+        print(f'Augmentations: {mixdb.mixture(mixid)}', file=f)

     # 7) write wav files
-    if
+    if enable_wav:
         write_wav(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
         write_wav(name=base_name + '_target.wav', audio=float_to_int16(target))
+        # write_wav(name=base_name + '_targetfi.wav', audio=float_to_int16(targetfi))
         write_wav(name=base_name + '_noise.wav', audio=float_to_int16(noise))
         write_wav(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
         write_wav(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))

     # debug code to test for perfect reconstruction of the extraction method
     # note both 75% olsa-hanns and 50% olsa-hann modes checked to have perfect reconstruction
-    # target_r = inverse_transform(target_f
-    # noise_r = inverse_transform(noise_f
+    # target_r = mixdb.inverse_transform(target_f)
+    # noise_r = mixdb.inverse_transform(noise_f)
     # _write_wav(name=base_name + '_target_r.wav', audio=float_to_int16(target_r))
     # _write_wav(name=base_name + '_noise_r.wav', audio=float_to_int16(noise_r))  # chk perfect rec

     # 8) Write out plot file
-    if
+    if enable_plot:
+        from matplotlib.backends.backend_pdf import PdfPages
         plot_fname = base_name + '_metric_spenh.pdf'

         # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
         # Original size (frames, stride, num_bands), decimates in stride dimension only if step is > 1
         # Reshape to get frames*decimated_stride, num_bands
-        step = int(
+        step = int(mixdb.feature_samples / mixdb.feature_step_samples)
         if feature.ndim != 3:
             raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, num_bands')

@@ -656,18 +993,17 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:

         with PdfPages(plot_fname) as pdf:
             # page1 we always have a mixture and prediction, target optional if truth provided
-            tfunc_name =
-
-
-
-
-
-
-
-
-
-
-                tfunc_name = tfunc_name + ' (db)'
+            tfunc_name = mixdb.target(1).truth_settings[0].function  # first target, assumes all have same
+            if tfunc_name == 'mapped_snr_f':
+                # leave as unmapped snr
+                predplot = predict
+                tfunc_name = mixdb.target(1).truth_settings[0].function
+            elif tfunc_name == 'target_f' or 'target_mixture_f':
+                predplot = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
+            else:
+                # use dB scale
+                predplot = 10 * np.log10(predict + np.finfo(np.float32).eps)
+                tfunc_name = tfunc_name + ' (db)'

             mixspec = 20 * np.log10(abs(mixture_f) + np.finfo(np.float32).eps)
             pdf.savefig(plot_mixpred(mixture=mixture,
@@ -710,7 +1046,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     return all_metrics_table_1, all_metrics_table_2


-def main()
+def main():
     from docopt import docopt

     import sonusai
@@ -729,17 +1065,20 @@ def main() -> None:
     truth_location = args['TLOC']
     whisper_model = args['--whisper-model'].lower()

-
+    import glob
     from os.path import basename
     from os.path import isdir
     from os.path import join
+    from os.path import split

     from tqdm import tqdm

     from sonusai import create_file_handler
     from sonusai import initial_log_messages
+    from sonusai import logger
     from sonusai import update_console_handler
-    from sonusai.mixture import
+    from sonusai.mixture import DEFAULT_NOISE
+    from sonusai.mixture import MixtureDatabase
     from sonusai.mixture import read_audio
     from sonusai.utils import calc_asr
     from sonusai.utils import pp_tqdm_imap
@@ -749,12 +1088,19 @@ def main() -> None:
         print(f'The specified predict location {predict_location} is not a valid subdirectory path, exiting ...')

     # allpfiles = listdir(predict_location)
-
-    predict_logfile = glob(predict_location + "/*predict.log")
-
-
+    allpfiles = glob.glob(predict_location + "/*.h5")
+    predict_logfile = glob.glob(predict_location + "/*predict.log")
+    predwav_mode = False
+    if len(allpfiles) <= 0 and not truth_est_mode:
+        allpfiles = glob.glob(predict_location + "/*.wav")  # check for wav files
+        if len(allpfiles) <= 0:
+            print(f'Subdirectory {predict_location} has no .h5 or .wav files, exiting ...')
+        else:
+            logger.info(f'Found {len(allpfiles)} prediction .wav files.')
+            predwav_mode = True
     else:
-        logger.info(f'Found {len(
+        logger.info(f'Found {len(allpfiles)} prediction .h5 files.')
+
     if len(predict_logfile) == 0:
         logger.info(f'Warning, predict location {predict_location} has no prediction log files.')
     else:
@@ -767,52 +1113,61 @@ def main() -> None:

     mixdb = MixtureDatabase(truth_location)
     mixids = mixdb.mixids_to_list(mixids)
-    logger.info(
-
+    logger.info(
+        f'Found mixdb of {mixdb.num_mixtures} total mixtures, with {mixdb.num_classes} classes in {truth_location}')
     logger.info(f'Only running specified subset of {len(mixids)} mixtures')

     enable_asr_warmup = False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if wer_method == 'none':
+        fnb = 'metric_spenh_'
+    elif wer_method == 'google':
+        fnb = 'metric_spenh_ggl_'
+        logger.info(f'WER enabled with method {wer_method}')
+        enable_asr_warmup = True
+    elif wer_method == 'deepgram':
+        fnb = 'metric_spenh_dgram_'
+        logger.info(f'WER enabled with method {wer_method}')
+        enable_asr_warmup = True
+    elif wer_method == 'aixplain_whisper':
+        fnb = 'metric_spenh_whspx_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    elif wer_method == 'whisper':
+        fnb = 'metric_spenh_whspl_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    elif wer_method == 'aaware_whisper':
+        fnb = 'metric_spenh_whspaaw_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    elif wer_method == 'fastwhisper':
+        fnb = 'metric_spenh_fwhsp_' + whisper_model + '_'
+        logger.info(f'WER enabled with method {wer_method} and whisper model {whisper_model}')
+        enable_asr_warmup = True
+    else:
+        logger.error(f'Unrecognized WER method: {wer_method}')
+        return

     if enable_asr_warmup:
+        DEFAULT_SPEECH = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
         audio = read_audio(DEFAULT_SPEECH)
-        logger.info(f'Warming up
+        logger.info(f'Warming up asr method, note for cloud service this could take up to a few min ...')
        asr_chk = calc_asr(audio, engine=wer_method, whisper_model_name=whisper_model)
         logger.info(f'Warmup completed, results {asr_chk}')

-    # Individual mixtures use pandas print, set precision to 2 decimal places
-    # pd.set_option('float_format', '{:.2f}'.format)
     MP_GLOBAL.mixdb = mixdb
     MP_GLOBAL.predict_location = predict_location
+    MP_GLOBAL.predwav_mode = predwav_mode
     MP_GLOBAL.truth_est_mode = truth_est_mode
     MP_GLOBAL.enable_plot = enable_plot
     MP_GLOBAL.enable_wav = enable_wav
     MP_GLOBAL.wer_method = wer_method
     MP_GLOBAL.whisper_model = whisper_model

-
-
+    # Individual mixtures use pandas print, set precision to 2 decimal places
+    # pd.set_option('float_format', '{:.2f}'.format)
+    progress = tqdm(total=len(mixids), desc='calc_metric_spenh')
+    all_metrics_tables = pp_tqdm_imap(_process_mixture, mixids, progress=progress, num_cpus=None)
     progress.close()

     all_metrics_table_1 = pd.concat([item[0] for item in all_metrics_tables])
@@ -842,29 +1197,32 @@ def main() -> None:
     mtab_snr_summary['PESQi%'] = 100 * (mtab_snr_summary['PESQ'] - mtab_snr_summary['MXPESQ']) / np.maximum(
         mtab_snr_summary['MXPESQ'], 0.01)
     for i in range(len(mtab_snr_summary)):
-
-
-        if tmp_mxwer == 0.0:
-            if tmp_wer == 0.0:
+        if mtab_snr_summary['MXWER'].iloc[i] == 0.0:
+            if mtab_snr_summary['WER'].iloc[i] == 0.0:
                 mtab_snr_summary['WERi%'].iloc[i] = 0.0
             else:
                 mtab_snr_summary['WERi%'].iloc[i] = -999.0
         else:
-            mtab_snr_summary['WERi%'].iloc[i] = 100 * (
+            mtab_snr_summary['WERi%'].iloc[i] = 100 * (mtab_snr_summary['MXWER'].iloc[i] -
+                                                       mtab_snr_summary['WER'].iloc[i]) / \
+                                                mtab_snr_summary['MXWER'].iloc[i]

     # Calculate avg metrics over all mixtures except -99
     all_mtab1_sorted_nom99 = all_mtab1_sorted[all_mtab1_sorted.MXSNR != -99]
     all_nom99_mean = all_mtab1_sorted_nom99.mean(numeric_only=True)

     # correct the percentage averages with a direct calculation (PESQ% and WER%):
-
-
-
-
+    # ser.iloc[pos]
+    all_nom99_mean['PESQi%'] = (100 * (all_nom99_mean['PESQ'] - all_nom99_mean['MXPESQ'])
+                                / np.maximum(all_nom99_mean['MXPESQ'], 0.01))  # pesq%
+    # all_nom99_mean[3] = 100 * (all_nom99_mean[2] - all_nom99_mean[1]) / np.maximum(all_nom99_mean[1], 0.01)  # pesq%
+    if all_nom99_mean['MXWER'] == 0.0:
+        if all_nom99_mean['WER'] == 0.0:
+            all_nom99_mean['WERi%'] = 0.0
         else:
-        all_nom99_mean[
-    else:
-        all_nom99_mean[
+            all_nom99_mean['WERi%'] = -999.0
+    else:  # wer%
+        all_nom99_mean['WERi%'] = 100 * (all_nom99_mean['MXWER'] - all_nom99_mean['WER']) / all_nom99_mean['MXWER']

     num_mix = len(mixids)
     if num_mix > 1:
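The WERi% branches above guard a division that is undefined when the mixture WER is zero. The same rule as a standalone sketch (the helper name is illustrative, not part of the package):

def wer_improvement_pct(mxwer: float, wer: float) -> float:
    # sentinel -999.0 mirrors the summary-table convention above
    if mxwer == 0.0:
        return 0.0 if wer == 0.0 else -999.0
    return 100 * (mxwer - wer) / mxwer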
sonusai/data_generator/torch_from_mixdb.py
CHANGED
@@ -32,7 +32,7 @@ class MixtureDatabaseDataset(Dataset):
     def __len__(self):
         return len(self.mixids)

-    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray]:
+    def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray, int]:
         """Get data from one mixture
         """
         import random
@@ -68,7 +68,7 @@ class MixtureDatabaseDataset(Dataset):
         feature = feature[start:start + self.cut_len]
         truth = truth[start:start + self.cut_len]

-        return feature, truth
+        return feature, truth, idx


 class AawareDataLoader(DataLoader):
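With __getitem__ now returning the mixture index as a third element, a consumer can map shuffled batches back to mixture IDs. A minimal sketch, assuming a MixtureDatabaseDataset instance built elsewhere (construction elided):

from torch.utils.data import DataLoader

def iterate_with_ids(dataset):  # dataset: a MixtureDatabaseDataset built elsewhere
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    for feature, truth, idx in loader:
        print(idx)  # original mixture indices for this shuffled batch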
sonusai/mixture/audio.py
CHANGED
@@ -59,6 +59,7 @@ def get_duration(audio: AudioT) -> float:
     :return: Duration of audio in seconds
     """
     from .constants import SAMPLE_RATE
+
     return len(audio) / SAMPLE_RATE


@@ -66,14 +67,15 @@ def validate_input_file(input_filepath: str) -> None:
     from os.path import exists
     from os.path import splitext

-    from
+    from soundfile import available_formats

     from sonusai import SonusAIError
+
     if not exists(input_filepath):
         raise SonusAIError(f'input_filepath {input_filepath} does not exist.')

     ext = splitext(input_filepath)[1][1:].lower()
-    read_formats =
+    read_formats = [item.lower() for item in available_formats().keys()]
     if ext not in read_formats:
         raise SonusAIError(f'This installation of SoX cannot process .{ext} files')

@@ -86,6 +88,7 @@ def read_audio(name: Location) -> AudioT:
     :return: Array of time domain audio data
     """
     from .torchaudio_audio import read_torchaudio_audio
+
     return read_torchaudio_audio(name)


@@ -97,4 +100,5 @@ def read_ir(name: Location) -> ImpulseResponseData:
     :return: ImpulseResponseData object
     """
     from .torchaudio_audio import read_torchaudio_ir
+
     return read_torchaudio_ir(name)
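The rewritten validate_input_file derives its extension whitelist from libsndfile via the soundfile package: available_formats() maps major-format codes (e.g. 'WAV', 'FLAC') to descriptions, so lowercasing the keys yields comparable extensions. A quick sketch of that check:

from soundfile import available_formats

read_formats = [fmt.lower() for fmt in available_formats().keys()]
print('wav' in read_formats)  # True on a typical libsndfile build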
sonusai/mixture/torchaudio_audio.py
CHANGED
@@ -21,7 +21,7 @@ def read_torchaudio_ir(name: Location) -> ImpulseResponseData:

     # Read impulse response data from audio file
     try:
-        raw, sample_rate = torchaudio.load(expanded_name)
+        raw, sample_rate = torchaudio.load(expanded_name, backend='soundfile')
     except Exception as e:
         if name != expanded_name:
             raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
@@ -58,7 +58,7 @@ def read_torchaudio_audio(name: Location) -> AudioT:
     expanded_name, _ = tokenized_expand(name)

     try:
-        out, samplerate = torchaudio.load(expanded_name)
+        out, samplerate = torchaudio.load(expanded_name, backend='soundfile')
         out = torch.reshape(out[0, :], (1, out.size()[1]))

     if not samplerate == SAMPLE_RATE:
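Both loaders now pin the decode path explicitly: in torchaudio 2.x the backend keyword on load() selects the I/O backend per call instead of relying on the global default. A minimal sketch (the file name is a placeholder):

import torchaudio

# force libsndfile-based decoding regardless of the global default
waveform, sample_rate = torchaudio.load('example.wav', backend='soundfile')
print(waveform.shape, sample_rate)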
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonusai
-Version: 0.14.0
+Version: 0.14.2
 Summary: Framework for building deep neural network models for sound, speech, and voice AI
 Home-page: https://aaware.com
 License: GPL-3.0-only
@@ -30,9 +30,11 @@ Requires-Dist: paho-mqtt (>=1.6.1,<2.0.0)
 Requires-Dist: pandas (>=2.1.1,<3.0.0)
 Requires-Dist: pesq (>=0.0.4,<0.0.5)
 Requires-Dist: pyaaware (>=1.5.3,<2.0.0)
+Requires-Dist: pystoi (>=0.3.3,<0.4.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: scikit-learn (>=1.3.1,<2.0.0)
 Requires-Dist: sh (>=2.0.6,<3.0.0)
+Requires-Dist: soundfile (>=0.12.1,<0.13.0)
 Requires-Dist: sox (>=1.4.1,<2.0.0)
 Requires-Dist: speechrecognition (>=3.10.0,<4.0.0)
 Requires-Dist: sqlalchemy[mypy] (>=2.0.22,<3.0.0)
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 sonusai/__init__.py,sha256=KmIJ9wni9d9v5pyu0pUxbacZIHGkAywB9CJwl7JME28,1526
 sonusai/aawscd_probwrite.py,sha256=GukR5owp_0A3DrqSl9fHWULYgclNft4D5OkHIwfxxkc,3698
-sonusai/calc_metric_spenh.py,sha256=
+sonusai/calc_metric_spenh.py,sha256=cE5lexBq6nZHY7-zudqsMsoz5fFYqVAWgKk21dIlHSw,60810
 sonusai/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonusai/data/genmixdb.yml,sha256=6C1GUr_0P5_hEAYSn0MLAqoSzDk_rP8TyV0sVMZqz1Q,16233
 sonusai/data/speech_ma01_01.wav,sha256=PK0vMKg-NR6rPE3KouxHGF6PKXnJCr7AwjMqfu98LUA,76644
@@ -8,7 +8,7 @@ sonusai/data/whitenoise.wav,sha256=I2umov0m34y56F9IsIBi1XtE76ZeZaSKDf70cJRe3pI,1
 sonusai/data_generator/__init__.py,sha256=ouCpY5EDV35fKFeKGQfIcU8uE-c3QcuNerTxUA1X5L8,232
 sonusai/data_generator/dataset_from_mixdb.py,sha256=4eQjyZ2TM2FVgbS9Cy8nevfYMBaIyrmHtUiQzJN19Do,5469
 sonusai/data_generator/keras_from_mixdb.py,sha256=V5CUsGz-akIYdgQy9ABxwNKMYKv01klA4xtMDveF6uI,6167
-sonusai/data_generator/torch_from_mixdb.py,sha256=
+sonusai/data_generator/torch_from_mixdb.py,sha256=lvEe9DDu_rIaoyhv9PW4UAnAWp5N74L8kRfxUhsh7oo,4279
 sonusai/evaluate.py,sha256=OH9g3l8yD4X-HHUf-qQriznXQJSW0gtf7XO4P-jbo1U,10025
 sonusai/genft.py,sha256=CnBiQKHQHZMlrq-F1QQJfpw-_45uhyut8cY-O7oDrTk,5557
 sonusai/genmix.py,sha256=l3n-vvSDtwIvYNw9Ulkn5fgAeoyh7reQgGE4Vvth334,7016
@@ -33,7 +33,7 @@ sonusai/metrics/confusion_matrix_summary.py,sha256=3qg6TMKjJeHtNjj2YnNjPFSlMrQXt
 sonusai/metrics/one_hot.py,sha256=lq58zKw0X9sdhJYGEldAkxPFqP3UOYpG_KdxkGHF_3c,13540
 sonusai/metrics/snr_summary.py,sha256=P4U5_Xr7v9F8kF-rZBnpsVNt3p42rIVS6zmch8yfVfg,5575
 sonusai/mixture/__init__.py,sha256=xlGw2FXoMZm2ra97GVfpJ-OTOp10d4dly8AXe8eJwhI,5294
-sonusai/mixture/audio.py,sha256=
+sonusai/mixture/audio.py,sha256=3pat-AIG_FXiGr3aPRa7DSLzolH3PodVDtve-xUuXfk,3242
 sonusai/mixture/augmentation.py,sha256=HwYUJCSmRBWhdnzqKz5zZnMANT83GzJkDrPcWUm6jbg,10884
 sonusai/mixture/class_count.py,sha256=27YDu1puarhp7Rd4EYWGJ-FHP8rAYGd55I6abGqCscY,988
 sonusai/mixture/config.py,sha256=QrasMP-2NGocse2rF_oYkRluDDPo-czFLDEwKtQ8A54,23629
@@ -50,7 +50,7 @@ sonusai/mixture/spectral_mask.py,sha256=qHR2DBpbtz4u1o9sdFMRsUDVUjbof_MRKPW8uY4R
 sonusai/mixture/target_class_balancing.py,sha256=P3gLe2SFos5_N2LWiVFwD-fa_imZH2f1qBiI55BeqXI,4768
 sonusai/mixture/targets.py,sha256=n7PenQuU0pPM_LLXJHmUZ3VeSGDEk7Kdf8y473Xdm6Q,7395
 sonusai/mixture/tokenized_shell_vars.py,sha256=gCxw8SQUcal6mqWKF7hOBTgSQmbJUk1nT0Gn3H8GA0U,4705
-sonusai/mixture/torchaudio_audio.py,sha256=
+sonusai/mixture/torchaudio_audio.py,sha256=qMYXeOSI8U8zaT9x0knPg1dHWzYmswZk7oFGAMG0Jks,2365
 sonusai/mixture/torchaudio_augmentation.py,sha256=LrG19X71UYKMr69WNgJs2R4OTt1QBYu_h8WL5a4ERyE,4462
 sonusai/mixture/truth.py,sha256=Is-nqLXIBM7wjYbS6yzy8mnR8JqxwSabnVHsza0rh_E,1427
 sonusai/mixture/truth_functions/__init__.py,sha256=82lKYHhLy8KW3gHngrocoqwupGVLVsWdIXdYs3vhjOc,359
@@ -114,7 +114,7 @@ sonusai/utils/trim_docstring.py,sha256=dSrtiRsEN4wkkvKBp6WDr13RUypfqZzgH_jOBLs1o
 sonusai/utils/wave.py,sha256=OZe8iVLbKSFv_GdQzLD9hJdBiqimK4FxJ0lVoDbbiqQ,572
 sonusai/utils/yes_or_no.py,sha256=eMLXBVH0cEahiXY4W2KNORmwNQ-ba10eRtldh0y4NYg,263
 sonusai/vars.py,sha256=m2AefF0m5bXWGXpJj8Pi42zWL2ydeEj7bkak3GrtMyM,940
-sonusai-0.14.0.dist-info/METADATA,sha256=
-sonusai-0.14.0.dist-info/WHEEL,sha256=
-sonusai-0.14.0.dist-info/entry_points.txt,sha256=
-sonusai-0.14.0.dist-info/RECORD,,
+sonusai-0.14.2.dist-info/METADATA,sha256=RR8bQ-ZUGFqZZJID86OMAAM6N0h7MYpfwJlDYf4t0v4,2819
+sonusai-0.14.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+sonusai-0.14.2.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
+sonusai-0.14.2.dist-info/RECORD,,
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/WHEEL
File without changes
{sonusai-0.14.0.dist-info → sonusai-0.14.2.dist-info}/entry_points.txt
File without changes