sonusai 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. The information is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sonusai/audiofe.py +22 -51
- sonusai/calc_metric_spenh.py +206 -213
- sonusai/doc/doc.py +1 -1
- sonusai/mixture/__init__.py +2 -0
- sonusai/mixture/audio.py +12 -0
- sonusai/mixture/datatypes.py +11 -3
- sonusai/mixture/mixdb.py +101 -0
- sonusai/mixture/soundfile_audio.py +39 -0
- sonusai/mixture/speaker_metadata.py +35 -0
- sonusai/mixture/torchaudio_audio.py +22 -0
- sonusai/mkmanifest.py +1 -1
- sonusai/onnx_predict.py +114 -410
- sonusai/queries/queries.py +1 -1
- sonusai/speech/__init__.py +3 -0
- sonusai/speech/l2arctic.py +116 -0
- sonusai/speech/librispeech.py +99 -0
- sonusai/speech/mcgill.py +70 -0
- sonusai/speech/textgrid.py +100 -0
- sonusai/speech/timit.py +135 -0
- sonusai/speech/types.py +12 -0
- sonusai/speech/vctk.py +52 -0
- sonusai/speech/voxceleb2.py +86 -0
- sonusai/utils/__init__.py +2 -1
- sonusai/utils/asr_manifest_functions/__init__.py +0 -1
- sonusai/utils/asr_manifest_functions/data.py +0 -8
- sonusai/utils/asr_manifest_functions/librispeech.py +1 -1
- sonusai/utils/asr_manifest_functions/mcgill_speech.py +1 -1
- sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +1 -1
- sonusai/utils/braced_glob.py +7 -3
- sonusai/utils/onnx_utils.py +110 -106
- sonusai/utils/path_info.py +7 -0
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/METADATA +2 -1
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/RECORD +35 -30
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/WHEEL +1 -1
- sonusai/calc_metric_spenh-save.py +0 -1334
- sonusai/onnx_predict-old.py +0 -240
- sonusai/onnx_predict-save.py +0 -487
- sonusai/ovino_predict.py +0 -508
- sonusai/ovino_query_devices.py +0 -47
- sonusai/torchl_onnx-old.py +0 -216
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/entry_points.txt +0 -0
sonusai/calc_metric_spenh.py
CHANGED
```diff
@@ -1,27 +1,25 @@
 """sonusai calc_metric_spenh
 
-usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e
+usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] PLOC TLOC
 
 options:
     -h, --help
     -v, --verbose                Be verbose.
-    -i MIXID, --mixid MIXID      Mixture ID(s) to process, can be range like 0:maxmix+1 [default: *]
+    -i MIXID, --mixid MIXID      Mixture ID(s) to process, can be range like 0:maxmix+1. [default: *]
     -t, --truth-est-mode         Calculate extraction and metrics using truth (instead of prediction).
     -p, --plot                   Enable PDF plots file generation per mixture.
     -w, --wav                    Generate WAV files per mixture.
     -s, --summary                Enable summary files generation.
-    -e
-
-    -m WMNAME, --whisper-model   Whisper model name used in aixplain_whisper and whisper WER methods.
-                                 [default: tiny]
+    -e ASR, --asr-method ASR     ASR method: deepgram, google, aixplain_whisper, whisper, or sensory. [default: none]
+    -m MODEL, --model            ASR model name used in some ASR methods. [default: tiny]
 
-Calculate speech enhancement metrics of prediction data in PLOC using SonusAI mixture data
-
+Calculate speech enhancement metrics of prediction data in PLOC using SonusAI mixture data in TLOC as truth/label
+reference. Metric and extraction data files are written into PLOC.
 
 PLOC  directory containing prediction data in .h5 files created from truth/label mixture data in TLOC
 TLOC  directory with SonusAI mixture database of truth/label mixture data
 
-For whisper
+For whisper ASR methods, the possible models used in local processing (ASR = whisper) are:
 {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}
 but note most are very computationally demanding and can overwhelm/hang a local system.
 
```
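The new usage line replaces the whisper-specific `-m WMNAME, --whisper-model` option with a generic `-e ASR, --asr-method` / `-m MODEL` pair. The usage block is docopt-style, and the `main()` changes later in this diff read the results as `args['--asr-method']`, `args['--model']`, `args['PLOC']`, and `args['TLOC']`. A minimal sketch of how the new surface parses, assuming docopt (which the args-dict access pattern suggests; this is an illustration, not the package code):

```python
from docopt import docopt

USAGE = """usage: calc_metric_spenh [-hvtpws] [-i MIXID] [-e ASR] [-m MODEL] PLOC TLOC

options:
    -i MIXID, --mixid MIXID   Mixture ID(s) to process. [default: *]
    -e ASR, --asr-method ASR  ASR method. [default: none]
    -m MODEL, --model MODEL   ASR model name. [default: tiny]
"""

# parse a sample command line instead of sys.argv
args = docopt(USAGE, argv=['-e', 'whisper', '-m', 'tiny', 'pred_dir', 'truth_dir'])
asr_method = args['--asr-method'].lower()  # 'whisper'
asr_model_name = args['--model'].lower()   # 'tiny'
predict_location = args['PLOC']            # 'pred_dir'
truth_location = args['TLOC']              # 'truth_dir'
```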
```diff
@@ -68,6 +66,7 @@ import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+
 from sonusai.mixture import AudioF
 from sonusai.mixture import AudioT
 from sonusai.mixture import Feature
```
```diff
@@ -93,12 +92,12 @@ matplotlib.use('SVG')
 class MPGlobal:
     mixdb: MixtureDatabase = None
     predict_location: str = None
-
+    predict_wav_mode: bool = None
     truth_est_mode: bool = None
     enable_plot: bool = None
     enable_wav: bool = None
-
-
+    asr_method: str = None
+    asr_model_name: str = None
 
 
 MP_GLOBAL = MPGlobal()
```
```diff
@@ -132,64 +131,62 @@ def snr(clean_speech, processed_speech, sample_rate):
     overall_snr = 10 * np.log10(np.sum(np.square(clean_speech)) / np.sum(np.square(clean_speech - processed_speech)))
 
     # Global Variables
-
-
-
-
+    win_length = round(30 * sample_rate / 1000)  # window length in samples
+    skip_rate = int(np.floor(win_length / 4))  # window skip in samples
+    min_snr = -10  # minimum SNR in dB
+    max_snr = 35  # maximum SNR in dB
 
     # For each frame of input speech, calculate the Segmental SNR
-    num_frames = int(clean_length /
+    num_frames = int(clean_length / skip_rate - (win_length / skip_rate))  # number of frames
     start = 0  # starting sample
-    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1,
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
 
     segmental_snr = np.empty(num_frames)
-
+    eps = np.spacing(1)
     for frame_count in range(num_frames):
         # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
-        clean_frame = clean_speech[start:start +
-        processed_frame = processed_speech[start:start +
+        clean_frame = clean_speech[start:start + win_length]
+        processed_frame = processed_speech[start:start + win_length]
         clean_frame = np.multiply(clean_frame, window)
         processed_frame = np.multiply(processed_frame, window)
 
         # (2) Compute the Segmental SNR
         signal_energy = np.sum(np.square(clean_frame))
         noise_energy = np.sum(np.square(clean_frame - processed_frame))
-        segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy +
-        segmental_snr[frame_count] = max(segmental_snr[frame_count],
-        segmental_snr[frame_count] = min(segmental_snr[frame_count],
+        segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
+        segmental_snr[frame_count] = np.max(segmental_snr[frame_count], min_snr)
+        segmental_snr[frame_count] = np.min(segmental_snr[frame_count], max_snr)
 
-        start = start +
+        start = start + skip_rate
 
     return overall_snr, segmental_snr
 
 
-def
+def lp_coefficients(speech_frame, model_order):
     # (1) Compute Autocorrelation Lags
-
-
-
+    win_length = np.size(speech_frame)
+    autocorrelation = np.empty(model_order + 1)
+    e = np.empty(model_order + 1)
     for k in range(model_order + 1):
-
+        autocorrelation[k] = np.dot(speech_frame[0:win_length - k], speech_frame[k: win_length])
 
     # (2) Levinson-Durbin
     a = np.ones(model_order)
     a_past = np.empty(model_order)
-
-
+    ref_coefficients = np.empty(model_order)
+    e[0] = autocorrelation[0]
     for i in range(model_order):
         a_past[0: i] = a[0: i]
-        sum_term = np.dot(a_past[0: i],
-
-        a[i] =
+        sum_term = np.dot(a_past[0: i], autocorrelation[i:0:-1])
+        ref_coefficients[i] = (autocorrelation[i + 1] - sum_term) / e[i]
+        a[i] = ref_coefficients[i]
         if i == 0:
-            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1:-1:-1],
+            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1:-1:-1], ref_coefficients[i])
         else:
-            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1::-1],
-
-
-
-    lpparams = np.concatenate((np.array([1]), -a))
-    return acorr, refcoeff, lpparams
+            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1::-1], ref_coefficients[i])
+        e[i + 1] = (1 - ref_coefficients[i] * ref_coefficients[i]) * e[i]
+    lp_params = np.concatenate((np.array([1]), -a))
+    return autocorrelation, ref_coefficients, lp_params
 
 
 def llr(clean_speech, processed_speech, sample_rate):
```
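The rewritten `snr()` gives the frame parameters readable names (`win_length`, `skip_rate`) and clamps each frame's SNR into `[min_snr, max_snr]`. A self-contained sketch of the same per-frame computation, offered as an illustration rather than the package code (it uses `np.clip` for the clamping step):

```python
import numpy as np

def segmental_snr(clean: np.ndarray, processed: np.ndarray, sample_rate: int = 16000):
    """Frame-based segmental SNR with a Hanning window, clamped to [-10, 35] dB."""
    win_length = round(30 * sample_rate / 1000)  # 30 ms window
    skip_rate = win_length // 4                  # 75% frame overlap
    min_snr, max_snr = -10.0, 35.0
    eps = np.spacing(1)

    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
    num_frames = int(len(clean) / skip_rate - win_length / skip_rate)

    seg_snr = np.empty(num_frames)
    start = 0
    for n in range(num_frames):
        c = clean[start:start + win_length] * window
        p = processed[start:start + win_length] * window
        signal_energy = np.sum(c ** 2)
        noise_energy = np.sum((c - p) ** 2)
        # np.clip keeps each frame's value inside [min_snr, max_snr]
        seg_snr[n] = np.clip(10 * np.log10(signal_energy / (noise_energy + eps) + eps),
                             min_snr, max_snr)
        start += skip_rate
    return seg_snr

# usage: seg = segmental_snr(clean_audio, enhanced_audio, 16000); seg.mean()
```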
```diff
@@ -199,38 +196,38 @@ def llr(clean_speech, processed_speech, sample_rate):
     clean_length = np.size(clean_speech)
     processed_length = np.size(processed_speech)
     if clean_length != processed_length:
-        raise ValueError('Both
+        raise ValueError('Both speech files must be same length.')
 
     # Global Variables
-
-
+    win_length = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+    skip_rate = (np.floor(win_length / 4)).astype(int)  # window skip in samples
     if sample_rate < 10000:
-
+        p = 10  # LPC Analysis Order
     else:
-
+        p = 16  # this could vary depending on sampling frequency.
 
     # For each frame of input speech, calculate the Log Likelihood Ratio
-    num_frames = int((clean_length -
+    num_frames = int((clean_length - win_length) / skip_rate)  # number of frames
     start = 0  # starting sample
-    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1,
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
 
     distortion = np.empty(num_frames)
     for frame_count in range(num_frames):
         # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
-        clean_frame = clean_speech[start: start +
-        processed_frame = processed_speech[start: start +
+        clean_frame = clean_speech[start: start + win_length]
+        processed_frame = processed_speech[start: start + win_length]
         clean_frame = np.multiply(clean_frame, window)
         processed_frame = np.multiply(processed_frame, window)
 
         # (2) Get the autocorrelation lags and LPC parameters used to compute the LLR measure.
-
-
+        r_clean, ref_clean, a_clean = lp_coefficients(clean_frame, p)
+        r_processed, ref_processed, a_processed = lp_coefficients(processed_frame, p)
 
         # (3) Compute the LLR measure
-        numerator = np.dot(np.matmul(
-        denominator = np.dot(np.matmul(
+        numerator = np.dot(np.matmul(a_processed, toeplitz(r_clean)), a_processed)
+        denominator = np.dot(np.matmul(a_clean, toeplitz(r_clean)), a_clean)
         distortion[frame_count] = np.log(numerator / denominator)
-        start = start +
+        start = start + skip_rate
     return distortion
 
 
```
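`llr()` scores each frame as the log ratio of two quadratic forms over the clean frame's autocorrelation matrix: the processed-signal LP coefficients in the numerator and the clean-signal LP coefficients in the denominator. A toy numeric illustration of that ratio with `scipy.linalg.toeplitz` (illustrative values, not package code):

```python
import numpy as np
from scipy.linalg import toeplitz

# toy LP parameter vectors [1, -a1, -a2] and autocorrelation lags r[0..2]
a_clean = np.array([1.0, -0.9, 0.2])
a_processed = np.array([1.0, -0.7, 0.1])
r_clean = np.array([1.0, 0.8, 0.5])  # lags 0..2 of the clean frame

R = toeplitz(r_clean)                 # symmetric autocorrelation matrix
numerator = a_processed @ R @ a_processed
denominator = a_clean @ R @ a_clean
llr_frame = np.log(numerator / denominator)
print(llr_frame)  # 0.0 when the two LP models coincide; grows with mismatch
```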
```diff
@@ -244,16 +241,15 @@ def wss(clean_speech, processed_speech, sample_rate):
         raise ValueError('Files must have same length.')
 
     # Global variables
-
-
+    win_length = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
+    skip_rate = (np.floor(np.divide(win_length, 4))).astype(int)  # window skip in samples
     max_freq = (np.divide(sample_rate, 2)).astype(int)  # maximum bandwidth
     num_crit = 25  # number of critical bands
 
-
-
-
-
-    Klocmax = 1.0  # value suggested by Klatt, pg 1280
+    n_fft = (np.power(2, np.ceil(np.log2(2 * win_length)))).astype(int)
+    n_fft_by_2 = (np.multiply(0.5, n_fft)).astype(int)  # FFT size/2
+    k_max = 20.0  # value suggested by Klatt, pg 1280
+    k_loc_max = 1.0  # value suggested by Klatt, pg 1280
 
     # Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
     cent_freq = np.array([50.0000, 120.000, 190.000, 260.000, 330.000, 400.000, 470.000,
```
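For the fs = 16000 Hz case used by `calc_speech_metrics`, these globals evaluate to win_length = round(30 * 16000 / 1000) = 480 samples, skip_rate = floor(480 / 4) = 120, n_fft = 2^ceil(log2(960)) = 1024, and n_fft_by_2 = 512, so each critical-band filter built in the next hunk spans 512 FFT bins.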
```diff
@@ -268,39 +264,38 @@ def wss(clean_speech, processed_speech, sample_rate):
     bw_min = bandwidth[0]  # minimum critical bandwidth
 
     # Set up the critical band filters.
-    # Note here that
+    # Note here that Gaussian-ly shaped filters are used.
     # Also, the sum of the filter weights are equivalent for each critical band filter.
     # Filter less than -30 dB and set to zero.
     min_factor = np.exp(-30.0 / (2.0 * 2.303))  # -30 dB point of filter
-    crit_filter = np.empty((num_crit,
+    crit_filter = np.empty((num_crit, n_fft_by_2))
     for i in range(num_crit):
-        f0 = (cent_freq[i] / max_freq) *
-        bw = (bandwidth[i] / max_freq) *
+        f0 = (cent_freq[i] / max_freq) * n_fft_by_2
+        bw = (bandwidth[i] / max_freq) * n_fft_by_2
         norm_factor = np.log(bw_min) - np.log(bandwidth[i])
-        j = np.arange(
+        j = np.arange(n_fft_by_2)
         crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
         cond = np.greater(crit_filter[i, :], min_factor)
         crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
     # For each frame of input speech, calculate the Weighted Spectral Slope Measure
-    num_frames = int(clean_length /
+    num_frames = int(clean_length / skip_rate - (win_length / skip_rate))  # number of frames
     start = 0  # starting sample
-    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1,
+    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
 
     distortion = np.empty(num_frames)
     for frame_count in range(num_frames):
         # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
-        clean_frame = clean_speech[start: start +
-        processed_frame = processed_speech[start: start +
+        clean_frame = clean_speech[start: start + win_length] / 32768
+        processed_frame = processed_speech[start: start + win_length] / 32768
         clean_frame = np.multiply(clean_frame, window)
         processed_frame = np.multiply(processed_frame, window)
         # (2) Compute the Power Spectrum of Clean and Processed
-        # if USE_FFT_SPECTRUM:
         clean_spec = np.square(np.abs(fft(clean_frame, n_fft)))
         processed_spec = np.square(np.abs(fft(processed_frame, n_fft)))
 
         # (3) Compute Filterbank Output Energies (in dB scale)
-        clean_energy = np.matmul(crit_filter, clean_spec[0:
-        processed_energy = np.matmul(crit_filter, processed_spec[0:
+        clean_energy = np.matmul(crit_filter, clean_spec[0:n_fft_by_2])
+        processed_energy = np.matmul(crit_filter, processed_spec[0:n_fft_by_2])
 
         clean_energy = 10 * np.log10(np.maximum(clean_energy, 1E-10))
         processed_energy = 10 * np.log10(np.maximum(processed_energy, 1E-10))
```
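The filter-setup loop fills a (num_crit, n_fft_by_2) matrix with Gaussian-shaped critical-band filters and zeroes any weight below the -30 dB point. A vectorized sketch of the same construction (the example bandwidths in the usage comment are illustrative; only the start of the cent_freq array appears in this diff):

```python
import numpy as np

def critical_band_filters(cent_freq, bandwidth, max_freq, n_fft_by_2):
    """Gaussian critical-band filterbank over the first n_fft_by_2 FFT bins."""
    bw_min = bandwidth[0]
    min_factor = np.exp(-30.0 / (2.0 * 2.303))          # -30 dB point of filter
    j = np.arange(n_fft_by_2)
    f0 = (cent_freq / max_freq)[:, None] * n_fft_by_2   # center bin per band
    bw = (bandwidth / max_freq)[:, None] * n_fft_by_2   # bandwidth in bins
    norm = (np.log(bw_min) - np.log(bandwidth))[:, None]
    filt = np.exp(-11 * ((j - np.floor(f0)) / bw) ** 2 + norm)
    return np.where(filt > min_factor, filt, 0.0)       # drop sub -30 dB weights

# usage with the first three bands from the hunk above (illustrative bandwidths):
# critical_band_filters(np.array([50., 120., 190.]), np.array([70., 70., 70.]), 8000, 512)
```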
```diff
@@ -340,39 +335,39 @@ def wss(clean_speech, processed_speech, sample_rate):
             processed_loc_peak[i] = processed_energy[n + 1]
 
         # (6) Compute the WSS Measure for this frame. This includes determination of the weighting function.
-
-
+        db_max_clean = np.max(clean_energy)
+        db_max_processed = np.max(processed_energy)
         '''
         The weights are calculated by averaging individual weighting factors from the clean and processed frame.
-        These weights
+        These weights w_clean and w_processed should range from 0 to 1 and place more emphasis on spectral peaks
         and less emphasis on slope differences in spectral valleys.
         This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
         '''
-
-
-
+        w_max_clean = np.divide(k_max, k_max + db_max_clean - clean_energy[0: num_crit - 1])
+        w_loc_max_clean = np.divide(k_loc_max, k_loc_max + clean_loc_peak - clean_energy[0: num_crit - 1])
+        w_clean = np.multiply(w_max_clean, w_loc_max_clean)
 
-
-
-
+        w_max_processed = np.divide(k_max, k_max + db_max_processed - processed_energy[0: num_crit - 1])
+        w_loc_max_processed = np.divide(k_loc_max, k_loc_max + processed_loc_peak - processed_energy[0: num_crit - 1])
+        w_processed = np.multiply(w_max_processed, w_loc_max_processed)
 
-
+        w = np.divide(np.add(w_clean, w_processed), 2.0)
         slope_diff = np.subtract(clean_slope, processed_slope)[0: num_crit - 1]
-        distortion[frame_count] = np.dot(
-        #
+        distortion[frame_count] = np.dot(w, np.square(slope_diff)) / np.sum(w)
+        # This normalization is not part of Klatt's paper, but helps to normalize the measure.
         # Here we scale the measure by the sum of the weights.
-        start = start +
+        start = start + skip_rate
     return distortion
 
 
 def calc_speech_metrics(hypothesis: np.ndarray,
                         reference: np.ndarray) -> tuple[float, int, int, int, float]:
     """
-    Calculate speech metrics pesq_mos,
+    Calculate speech metrics pesq_mos, c_sig, c_bak, c_ovl, seg_snr. These are all related and thus included
     in one function. Reference: matlab script "compute_metrics.m".
 
     Usage:
-        pesq,
+        pesq, c_sig, c_bak, c_ovl, ssnr = compute_metrics(hypothesis, reference, fs, path)
     reference: clean audio as array
     hypothesis: enhanced audio as array
     Audio must have sampling rate = 16000 Hz.
```
```diff
@@ -383,41 +378,39 @@ def calc_speech_metrics(hypothesis: np.ndarray,
     """
     from sonusai.metrics import calc_pesq
 
-
+    fs = 16000
 
     # compute the WSS measure
-    wss_dist_vec = wss(reference, hypothesis,
+    wss_dist_vec = wss(reference, hypothesis, fs)
     wss_dist_vec = np.sort(wss_dist_vec)
     alpha = 0.95  # value from CMGAN ref implementation
     wss_dist = np.mean(wss_dist_vec[0: round(np.size(wss_dist_vec) * alpha)])
 
     # compute the LLR measure
-    llr_dist = llr(reference, hypothesis,
+    llr_dist = llr(reference, hypothesis, fs)
     ll_rs = np.sort(llr_dist)
     llr_len = round(np.size(llr_dist) * alpha)
     llr_mean = np.mean(ll_rs[0: llr_len])
 
     # compute the SNRseg
-    snr_dist, segsnr_dist = snr(reference, hypothesis,
-
-    segSNR = np.mean(segsnr_dist)
+    snr_dist, segsnr_dist = snr(reference, hypothesis, fs)
+    seg_snr = np.mean(segsnr_dist)
 
     # compute the pesq (use Sonusai wrapper, only fs=16k, mode=wb support)
     pesq_mos = calc_pesq(hypothesis=hypothesis, reference=reference)
-    # pesq_mos = pesq(sampling_rate1, data1, data2, 'wb')
 
     # now compute the composite measures
-
-
-
-
-
-
-
-
-
+    c_sig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist
+    c_sig = max(1, c_sig)
+    c_sig = min(5, c_sig)  # limit values to [1, 5]
+    c_bak = 1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * seg_snr
+    c_bak = max(1, c_bak)
+    c_bak = min(5, c_bak)  # limit values to [1, 5]
+    c_ovl = 1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist
+    c_ovl = max(1, c_ovl)
+    c_ovl = min(5, c_ovl)  # limit values to [1, 5]
 
-    return pesq_mos,
+    return pesq_mos, c_sig, c_bak, c_ovl, seg_snr
 
 
 def mean_square_error(hypothesis: np.ndarray,
```
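The restored block computes the classic composite quality measures (c_sig, c_bak, c_ovl) as linear combinations of PESQ, mean LLR, WSS distance, and segmental SNR, each clamped to the MOS range [1, 5], matching the "compute_metrics.m" reference named in the docstring. The same arithmetic, condensed with np.clip (a sketch, not the package code):

```python
import numpy as np

def composite_measures(pesq_mos, llr_mean, wss_dist, seg_snr):
    """Composite speech-quality measures, each limited to the MOS range [1, 5]."""
    c_sig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist
    c_bak = 1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * seg_snr
    c_ovl = 1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist
    return tuple(float(np.clip(c, 1.0, 5.0)) for c in (c_sig, c_bak, c_ovl))

# e.g. composite_measures(pesq_mos=2.5, llr_mean=0.6, wss_dist=40.0, seg_snr=3.0)
# -> (roughly 3.62, 2.74, 3.02)
```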
```diff
@@ -564,10 +557,8 @@ def plot_mixpred(mixture: AudioT,
     ax[p].plot(x_axis, mixture, label='Mixture', color='mistyrose')
     ax[0].set_ylabel('magnitude', color='tab:blue')
     ax[p].set_xlim(x_axis[0], x_axis[-1])
-    # ax[p].set_ylim([-1.025, 1.025])
     if target is not None:  # Plot target time-domain waveform on top of mixture
         ax[0].plot(x_axis, target, label='Target', color='tab:blue')
-        # ax[0].tick_params(axis='y', labelcolor=color)
     ax[p].set_title('Waveform')
 
     # Plot the mixture spectrogram
```
```diff
@@ -589,10 +580,10 @@ def plot_mixpred(mixture: AudioT,
     return fig
 
 
-def
-
-
-
+def plot_pdb_predict_truth(predict: np.ndarray,
+                           truth_f: Optional[np.ndarray] = None,
+                           metric: Optional[np.ndarray] = None,
+                           tp_title: str = '') -> plt.Figure:
     """Plot predict and optionally truth and a metric in power db, e.g. applies 10*log10(predict)"""
     num_plots = 2
     if truth_f is not None:
```
```diff
@@ -636,16 +627,15 @@ def plot_pdb_predtruth(predict: np.ndarray,
             ax[p].set_title('SNR and SNR mse (mean over freq. db)')
         else:
             ax[p].set_title('SNR (mean over freq. db)')
-    # ax[0].tick_params(axis='y', labelcolor=color)
     return fig
 
 
-def
-
-
-
-
-
+def plot_e_predict_truth(predict: np.ndarray,
+                         predict_wav: np.ndarray,
+                         truth_f: Optional[np.ndarray] = None,
+                         truth_wav: Optional[np.ndarray] = None,
+                         metric: Optional[np.ndarray] = None,
+                         tp_title: str = '') -> plt.Figure:
     """Plot predict spectrogram and waveform and optionally truth and a metric)"""
     num_plots = 2
     if truth_f is not None:
```
```diff
@@ -666,7 +656,7 @@ def plot_epredtruth(predict: np.ndarray,
         ax[p].imshow(truth_f.transpose(), im.cmap, aspect='auto', interpolation='nearest', origin='lower')
         ax[p].set_title('Truth')
 
-    # Plot
+    # Plot predict wav, and optionally truth avg and metric lines
     p += 1
     x_axis = np.arange(len(predict_wav), dtype=np.float32)  # / SAMPLE_RATE
     ax[p].plot(x_axis, predict_wav, color='black', linestyle='dashed', label='Speech Estimate')
```
```diff
@@ -732,12 +722,12 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
 
     mixdb = MP_GLOBAL.mixdb
     predict_location = MP_GLOBAL.predict_location
-
+    predict_wav_mode = MP_GLOBAL.predict_wav_mode
     truth_est_mode = MP_GLOBAL.truth_est_mode
     enable_plot = MP_GLOBAL.enable_plot
     enable_wav = MP_GLOBAL.enable_wav
-
-
+    asr_method = MP_GLOBAL.asr_method
+    asr_model_name = MP_GLOBAL.asr_model_name
 
     # 1) Read predict data, var predict with shape [BatchSize,Classes] or [BatchSize,Tsteps,Classes]
     output_name = join(predict_location, mixdb.mixture(mixid).name)
```
```diff
@@ -749,7 +739,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         base_name = splitext(output_name)[0] + '_truest'
     else:
         base_name, ext = splitext(output_name)  # base_name used later
-    if not
+    if not predict_wav_mode:
         try:
             with h5py.File(output_name, 'r') as f:
                 predict = np.array(f['predict'])
```
```diff
@@ -761,8 +751,8 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             predict, _ = reshape_outputs(predict=predict, truth=None, timesteps=predict.shape[1])
     else:
         base_name, ext = splitext(output_name)
-
-        audio = read_audio(
+        predict_name = join(base_name + '.wav')
+        audio = read_audio(predict_name)
         predict = forward_transform(audio, mixdb.ft_config)
         if mixdb.feature[0:1] == 'h':
             predict = power_compress(predict)
```
```diff
@@ -773,8 +763,8 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     target_f = mixdb.mixture_targets_f(mixid, targets=tmp)[0]
     target = tmp[0]
     mixture = mixdb.mixture_mixture(mixid)  # note: gives full reverberated/distorted target, but no specaugment
-    #
-    #
+    # noise_wo_dist = mixdb.mixture_noise(mixid)  # noise without specaugment and distortion
+    # noise_wo_dist_f = mixdb.mixture_noise_f(mixid, noise=noise_wo_dist)
     noise = mixture - target  # has time-domain distortion (ir,etc.) but does not have specaugment
     # noise_f = mixdb.mixture_noise_f(mixid, noise=noise)
     segsnr_f = mixdb.mixture_segsnr(mixid, target=target, noise=noise)  # note: uses pre-IR, pre-specaug audio
```
```diff
@@ -784,9 +774,9 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     segsnr_f[segsnr_f == inf] = 7.944e8  # 99db
     segsnr_f[segsnr_f == -inf] = 1.258e-10  # -99db
     # need to use inv-tf to match #samples & latency shift properties of predict inv tf
-
-
-    #
+    target_fi = inverse_transform(target_f, mixdb.it_config)
+    noise_fi = inverse_transform(noise_f, mixdb.it_config)
+    # mixture_fi = mixdb.inverse_transform(mixture_f)
 
     # gen feature, truth - note feature only used for plots
     # TBD parse truth_f for different formats and also multi-truth
```
```diff
@@ -798,17 +788,17 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
 
     if not truth_est_mode:
         if predict.shape[0] < target_f.shape[0]:  # target_f, truth_f, mixture_f, etc. same size
-
-            logger.debug(f'Warning: prediction frames less than mixture, trimming {
-            target_f = target_f[0:-
-
-
-            target = target[0:-
-            noise_f = noise_f[0:-
-            noise = noise[0:-
-            mixture_f = mixture_f[0:-
-            mixture = mixture[0:-
-            truth_f = truth_f[0:-
+            trim_f = target_f.shape[0] - predict.shape[0]
+            logger.debug(f'Warning: prediction frames less than mixture, trimming {trim_f} frames from all truth.')
+            target_f = target_f[0:-trim_f, :]
+            target_fi, _ = inverse_transform(target_f, mixdb.it_config)
+            trim_t = target.shape[0] - target_fi.shape[0]
+            target = target[0:-trim_t]
+            noise_f = noise_f[0:-trim_f, :]
+            noise = noise[0:-trim_t]
+            mixture_f = mixture_f[0:-trim_f, :]
+            mixture = mixture[0:-trim_t]
+            truth_f = truth_f[0:-trim_f, :]
         elif predict.shape[0] > target_f.shape[0]:
             raise SonusAIError(
                 f'Error: prediction has more frames than true mixture {predict.shape[0]} vs {truth_f.shape[0]}')
```
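The new trim logic derives one offset in frames (trim_f) for the spectral arrays and a second offset in samples (trim_t) from the inverse transform of the already-trimmed target_f, so the feature-domain and time-domain views stay aligned. A shape-only sketch of that two-domain alignment, where samples_per_frame stands in for the frame-to-sample relation that inverse_transform realizes in the diff:

```python
import numpy as np

def trim_to_predict(predict, target_f, target, samples_per_frame):
    """Trim frames in the feature domain, then samples in the time domain."""
    trim_f = target_f.shape[0] - predict.shape[0]  # frames to drop
    if trim_f > 0:
        target_f = target_f[:-trim_f, :]
        # time-domain length implied by the trimmed feature frames
        implied_samples = target_f.shape[0] * samples_per_frame
        trim_t = target.shape[0] - implied_samples  # samples to drop
        if trim_t > 0:
            target = target[:-trim_t]
    return target_f, target
```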
```diff
@@ -848,14 +838,14 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     phd, phd_bin, phd_frame = phase_distance(hypothesis=predict_complex, reference=truth_f_complex)
 
     # Noise td logerr
-    # lerr_nt, lerr_nt_bin, lerr_nt_frame = log_error(
+    # lerr_nt, lerr_nt_bin, lerr_nt_frame = log_error(noise_fi, noise_truth_est_audio)
 
     # # SA-SDR (time-domain source-aggragated SDR)
-    ytrue = np.concatenate((
+    ytrue = np.concatenate((target_fi[:, np.newaxis], noise_fi[:, np.newaxis]), axis=1)
     ypred = np.concatenate((target_est_wav[:, np.newaxis], noise_est_wav[:, np.newaxis]), axis=1)
     # # note: w/o scale is more pessimistic number
     # sa_sdr, _ = calc_sa_sdr(hypothesis=ypred, reference=ytrue)
-    target_stoi = stoi(
+    target_stoi = stoi(target_fi, target_est_wav, 16000, extended=False)
 
     wsdr, wsdr_cc, wsdr_cw = calc_wsdr(hypothesis=ypred, reference=ytrue, with_log=True)
     # logger.debug(f'wsdr weight sum for mixid {mixid} = {np.sum(wsdr_cw)}.')
```
```diff
@@ -865,7 +855,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # Speech intelligibility measure - PESQ
     if int(mixdb.mixture(mixid).snr) > -99:
         # len = target_est_wav.shape[0]
-        pesq_speech, csig_tg, cbak_tg, covl_tg, sgsnr_tg = calc_speech_metrics(target_est_wav,
+        pesq_speech, csig_tg, cbak_tg, covl_tg, sgsnr_tg = calc_speech_metrics(target_est_wav, target_fi)
         pesq_mixture, csig_mx, cbak_mx, covl_mx, sgsnr_mx = calc_speech_metrics(mixture, target)
         # pesq_speech_tst = calc_pesq(hypothesis=target_est_wav, reference=target)
         # pesq_mixture_tst = calc_pesq(hypothesis=mixture, reference=target)
```
```diff
@@ -884,23 +874,26 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         covl_mx = 0
         covl_tg = 0
 
-    # Calc
-    asr_tt =
-    asr_mx =
-    asr_tge =
-    if
+    # Calc ASR
+    asr_tt = None
+    asr_mx = None
+    asr_tge = None
+    if asr_method == 'none' or mixdb.mixture(mixid).snr == -99:  # noise only, ignore/reset target asr
         wer_mx = float('nan')
         wer_tge = float('nan')
         wer_pi = float('nan')
     else:
-
-
-
-
+        asr_tt = MP_GLOBAL.mixdb.get_speech_metadata(mixid, 'text')[0]  # ignore mixup
+        if asr_tt is None:
+            asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth
+        # if MP_GLOBAL.mixdb.asr_manifests:
+        #     asr_tt = MP_GLOBAL.mixdb.mixture_asr_data(mixid)[0]  # ignore mixup
+        # else:
+        #     asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth
 
         if asr_tt:
-            asr_mx = calc_asr(mixture, engine=
-            asr_tge = calc_asr(target_est_wav, engine=
+            asr_mx = calc_asr(mixture, engine=asr_method, whisper_model_name=asr_model_name).text
+            asr_tge = calc_asr(target_est_wav, engine=asr_method, whisper_model_name=asr_model_name).text
 
             wer_mx = calc_wer(asr_mx, asr_tt).wer * 100  # mixture wer
             wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate wer
```
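calc_wer itself is not part of this diff; the quantity it reports (scaled by 100 above) is the standard word error rate: word-level edit distance divided by the reference word count. For reference, a minimal implementation of that definition (an illustration, not sonusai's code):

```python
def word_error_rate(hypothesis: str, reference: str) -> float:
    """WER = (substitutions + insertions + deletions) / reference word count."""
    hyp, ref = hypothesis.split(), reference.split()
    # classic dynamic-programming edit distance over words
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution or match
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

# word_error_rate('the cat sat', 'the cat sat down') -> 0.25 (one deletion, 4 reference words)
```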
```diff
@@ -962,10 +955,10 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
         print('', file=f)
         print(f'Target path: {mixdb.target_file(ti).name}', file=f)
         print(f'Noise path: {mixdb.noise_file(ni).name}', file=f)
-        if
-            print(f'
+        if asr_method != 'none':
+            print(f'ASR method: {asr_method} and whisper model (if used): {asr_model_name}', file=f)
             if mixdb.asr_manifests:
-                print(f'ASR truth from
+                print(f'ASR truth from metadata: {asr_tt}', file=f)
             else:
                 print(f'ASR truth from wer method: {asr_tt}', file=f)
             print(f'ASR result for mixture: {asr_mx}', file=f)
```
```diff
@@ -977,7 +970,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     if enable_wav:
         write_wav(name=base_name + '_mixture.wav', audio=float_to_int16(mixture))
         write_wav(name=base_name + '_target.wav', audio=float_to_int16(target))
-        # write_wav(name=base_name + '
+        # write_wav(name=base_name + '_target_fi.wav', audio=float_to_int16(target_fi))
         write_wav(name=base_name + '_noise.wav', audio=float_to_int16(noise))
         write_wav(name=base_name + '_target_est.wav', audio=float_to_int16(target_est_wav))
         write_wav(name=base_name + '_noise_est.wav', audio=float_to_int16(noise_est_wav))
```
```diff
@@ -992,7 +985,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # 8) Write out plot file
     if enable_plot:
         from matplotlib.backends.backend_pdf import PdfPages
-
+        plot_name = base_name + '_metric_spenh.pdf'
 
         # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
         # Original size (frames, stride, num_bands), decimates in stride dimension only if step is > 1
```
```diff
@@ -1007,7 +1000,7 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             feat_sgram = feat_sgram[:, -step:, :]  # decimate, Fx1xB
         feat_sgram = np.reshape(feat_sgram, (feat_sgram.shape[0] * feat_sgram.shape[1], feat_sgram.shape[2]))
 
-        with PdfPages(
+        with PdfPages(plot_name) as pdf:
             # page1 we always have a mixture and prediction, target optional if truth provided
             tfunc_name = mixdb.target_file(1).truth_settings[0].function  # first target, assumes all have same
             if tfunc_name == 'mapped_snr_f':
```
```diff
@@ -1036,25 +1029,25 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
             tg_spec = 20 * np.log10(abs(target_f) + np.finfo(np.float32).eps)
             tg_est_spec = 20 * np.log10(abs(predict_complex) + np.finfo(np.float32).eps)
             # n_spec = np.reshape(n_spec,(n_spec.shape[0] * n_spec.shape[1], n_spec.shape[2]))
-            pdf.savefig(
-
-
-
-
-
+            pdf.savefig(plot_e_predict_truth(predict=tg_est_spec,
+                                             predict_wav=target_est_wav,
+                                             truth_f=tg_spec,
+                                             truth_wav=target_fi,
+                                             metric=np.vstack((lerr_tg_frame, phd_frame)).T,
+                                             tp_title='speech estimate'))
 
             # page 4 noise extraction
             n_spec = 20 * np.log10(abs(noise_f) + np.finfo(np.float32).eps)
             n_est_spec = 20 * np.log10(abs(noise_est_complex) + np.finfo(np.float32).eps)
-            pdf.savefig(
-
-
-
-
-
+            pdf.savefig(plot_e_predict_truth(predict=n_est_spec,
+                                             predict_wav=noise_est_wav,
+                                             truth_f=n_spec,
+                                             truth_wav=noise_fi,
+                                             metric=lerr_n_frame,
+                                             tp_title='noise estimate'))
 
             # Plot error waveforms
-            # tg_err_wav =
+            # tg_err_wav = target_fi - target_est_wav
             # tg_err_spec = 20*np.log10(np.abs(target_f - predict_complex))
 
         plt.close('all')
```
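All pages for a mixture are written through one PdfPages context, so each pdf.savefig call appends a page to the per-mixture report. The pattern in isolation, using only standard matplotlib API (a standalone sketch, not package code):

```python
import matplotlib
matplotlib.use('SVG')  # headless backend, as the module configures above
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

with PdfPages('metric_spenh_example.pdf') as pdf:
    for title in ('speech estimate', 'noise estimate'):
        fig, ax = plt.subplots()
        ax.plot(np.random.default_rng(0).standard_normal(256))
        ax.set_title(title)
        pdf.savefig(fig)  # one page per savefig call
plt.close('all')
```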
```diff
@@ -1072,14 +1065,14 @@ def main():
 
     verbose = args['--verbose']
     mixids = args['--mixid']
-
-
+    asr_method = args['--asr-method'].lower()
+    asr_model_name = args['--model'].lower()
     truth_est_mode = args['--truth-est-mode']
     enable_plot = args['--plot']
     enable_wav = args['--wav']
     enable_summary = args['--summary']
+    predict_location = args['PLOC']
     truth_location = args['TLOC']
-    whisper_model = args['--whisper-model'].lower()
 
     import glob
     from os.path import basename
```
```diff
@@ -1103,19 +1096,19 @@ def main():
     if not isdir(predict_location):
         print(f'The specified predict location {predict_location} is not a valid subdirectory path, exiting ...')
 
-    #
-
+    # all_predict_files = listdir(predict_location)
+    all_predict_files = glob.glob(predict_location + "/*.h5")
     predict_logfile = glob.glob(predict_location + "/*predict.log")
-
-    if len(
-
-    if len(
+    predict_wav_mode = False
+    if len(all_predict_files) <= 0 and not truth_est_mode:
+        all_predict_files = glob.glob(predict_location + "/*.wav")  # check for wav files
+        if len(all_predict_files) <= 0:
             print(f'Subdirectory {predict_location} has no .h5 or .wav files, exiting ...')
         else:
-            logger.info(f'Found {len(
-
+            logger.info(f'Found {len(all_predict_files)} prediction .wav files.')
+            predict_wav_mode = True
     else:
-        logger.info(f'Found {len(
+        logger.info(f'Found {len(all_predict_files)} prediction .h5 files.')
 
     if len(predict_logfile) == 0:
         logger.info(f'Warning, predict location {predict_location} has no prediction log files.')
```
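The discovery logic prefers .h5 prediction files and only falls back to raw .wav files, setting predict_wav_mode, when no .h5 files exist and truth-est mode is off. The same decision condensed into a helper (a sketch, not package code):

```python
import glob

def find_predict_files(predict_location: str, truth_est_mode: bool):
    """Return (files, predict_wav_mode); .h5 takes precedence over .wav."""
    files = glob.glob(predict_location + '/*.h5')
    if files:
        return files, False
    if truth_est_mode:
        return [], False
    wavs = glob.glob(predict_location + '/*.wav')  # fall back to wav predictions
    return wavs, bool(wavs)
```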
```diff
@@ -1134,51 +1127,51 @@ def main():
         logger.info(f'Only running specified subset of {len(mixids)} mixtures')
 
     enable_asr_warmup = False
-    if
+    if asr_method == 'none':
         fnb = 'metric_spenh_'
-    elif
+    elif asr_method == 'google':
         fnb = 'metric_spenh_ggl_'
-        logger.info(f'
+        logger.info(f'ASR enabled with method {asr_method}')
         enable_asr_warmup = True
-    elif
+    elif asr_method == 'deepgram':
         fnb = 'metric_spenh_dgram_'
-        logger.info(f'
+        logger.info(f'ASR enabled with method {asr_method}')
         enable_asr_warmup = True
-    elif
-        fnb = 'metric_spenh_whspx_' +
-        logger.info(f'
+    elif asr_method == 'aixplain_whisper':
+        fnb = 'metric_spenh_whspx_' + asr_model_name + '_'
+        logger.info(f'ASR enabled with method {asr_method} and whisper model {asr_model_name}')
         enable_asr_warmup = True
-    elif
-        fnb = 'metric_spenh_whspl_' +
-        logger.info(f'
+    elif asr_method == 'whisper':
+        fnb = 'metric_spenh_whspl_' + asr_model_name + '_'
+        logger.info(f'ASR enabled with method {asr_method} and whisper model {asr_model_name}')
         enable_asr_warmup = True
-    elif
-        fnb = 'metric_spenh_whspaaw_' +
-        logger.info(f'
+    elif asr_method == 'aaware_whisper':
+        fnb = 'metric_spenh_whspaaw_' + asr_model_name + '_'
+        logger.info(f'ASR enabled with method {asr_method} and whisper model {asr_model_name}')
         enable_asr_warmup = True
-    elif
-        fnb = 'metric_spenh_fwhsp_' +
-        logger.info(f'
+    elif asr_method == 'faster_whisper':
+        fnb = 'metric_spenh_fwhsp_' + asr_model_name + '_'
+        logger.info(f'ASR enabled with method {asr_method} and whisper model {asr_model_name}')
         enable_asr_warmup = True
     else:
-        logger.error(f'Unrecognized
+        logger.error(f'Unrecognized ASR method: {asr_method}')
         return
 
     if enable_asr_warmup:
         DEFAULT_SPEECH = split(DEFAULT_NOISE)[0] + '/speech_ma01_01.wav'
         audio = read_audio(DEFAULT_SPEECH)
         logger.info(f'Warming up asr method, note for cloud service this could take up to a few min ...')
-        asr_chk = calc_asr(audio, engine=
+        asr_chk = calc_asr(audio, engine=asr_method, whisper_model_name=asr_model_name)
         logger.info(f'Warmup completed, results {asr_chk}')
 
     MP_GLOBAL.mixdb = mixdb
     MP_GLOBAL.predict_location = predict_location
-    MP_GLOBAL.
+    MP_GLOBAL.predict_wav_mode = predict_wav_mode
     MP_GLOBAL.truth_est_mode = truth_est_mode
     MP_GLOBAL.enable_plot = enable_plot
     MP_GLOBAL.enable_wav = enable_wav
-    MP_GLOBAL.
-    MP_GLOBAL.
+    MP_GLOBAL.asr_method = asr_method
+    MP_GLOBAL.asr_model_name = asr_model_name
 
     # Individual mixtures use pandas print, set precision to 2 decimal places
     # pd.set_option('float_format', '{:.2f}'.format)
```
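The elif chain pairs each ASR method with a summary-file prefix and decides whether an engine warmup is needed. A table-driven sketch of the same mapping (a hypothetical refactor for illustration; the package uses the chain above):

```python
# method -> (prefix stem, include model name in prefix, warm up ASR engine)
ASR_METHODS = {
    'none':             ('metric_spenh_',         False, False),
    'google':           ('metric_spenh_ggl_',     False, True),
    'deepgram':         ('metric_spenh_dgram_',   False, True),
    'aixplain_whisper': ('metric_spenh_whspx_',   True,  True),
    'whisper':          ('metric_spenh_whspl_',   True,  True),
    'aaware_whisper':   ('metric_spenh_whspaaw_', True,  True),
    'faster_whisper':   ('metric_spenh_fwhsp_',   True,  True),
}

def summary_prefix(asr_method: str, asr_model_name: str) -> tuple[str, bool]:
    """Return (filename prefix, warmup flag); KeyError means unrecognized method."""
    stem, with_model, warmup = ASR_METHODS[asr_method]
    fnb = stem + asr_model_name + '_' if with_model else stem
    return fnb, warmup
```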
```diff
@@ -1255,7 +1248,7 @@ def main():
         ofname = join(predict_location, fnb + 'summary_truest.txt')
 
     with open(ofname, 'w') as f:
-        print(f'
+        print(f'ASR enabled with method {asr_method}, whisper model, if used: {asr_model_name}', file=f)
         print(f'Speech enhancement metrics avg over all {len(all_mtab1_sorted_nom99)} non -99 SNR mixtures:',
               file=f)
         print(all_nom99_mean.to_frame().T.round(2).to_string(float_format=lambda x: "{:.2f}".format(x),
```
```diff
@@ -1318,7 +1311,7 @@ def main():
     label = f'Extraction statistics stats over {num_mix} mixtures:'
     pd.DataFrame([label]).to_csv(csv_name, **header_args)
     all_metrics_table_2.describe().round(2).to_csv(csv_name, **table_args)
-    label = f'
+    label = f'ASR enabled with method {asr_method}, whisper model, if used: {asr_model_name}'
     pd.DataFrame([label]).to_csv(csv_name, **header_args)
 
     if not truth_est_mode:
```