sonusai 0.18.2__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sonusai/__init__.py +1 -0
- sonusai/audiofe.py +1 -1
- sonusai/calc_metric_spenh.py +32 -362
- sonusai/data/genmixdb.yml +2 -0
- sonusai/doc/doc.py +45 -4
- sonusai/genmetrics.py +137 -109
- sonusai/lsdb.py +2 -2
- sonusai/metrics/__init__.py +4 -0
- sonusai/metrics/calc_audio_stats.py +42 -0
- sonusai/metrics/calc_pesq.py +12 -8
- sonusai/metrics/calc_phase_distance.py +43 -0
- sonusai/metrics/calc_snr_f.py +34 -0
- sonusai/metrics/calc_speech.py +312 -0
- sonusai/metrics/calc_wer.py +2 -3
- sonusai/metrics/calc_wsdr.py +0 -59
- sonusai/mixture/__init__.py +3 -2
- sonusai/mixture/audio.py +6 -5
- sonusai/mixture/config.py +13 -0
- sonusai/mixture/constants.py +1 -0
- sonusai/mixture/datatypes.py +33 -0
- sonusai/mixture/generation.py +6 -2
- sonusai/mixture/mixdb.py +261 -122
- sonusai/mixture/soundfile_audio.py +8 -6
- sonusai/mixture/sox_audio.py +16 -13
- sonusai/mixture/torchaudio_audio.py +6 -4
- sonusai/mixture/truth_functions/energy.py +40 -28
- sonusai/mixture/truth_functions/target.py +0 -1
- sonusai/utils/__init__.py +1 -1
- sonusai/utils/asr.py +26 -39
- sonusai/utils/asr_functions/aaware_whisper.py +3 -3
- {sonusai-0.18.2.dist-info → sonusai-0.18.4.dist-info}/METADATA +1 -1
- {sonusai-0.18.2.dist-info → sonusai-0.18.4.dist-info}/RECORD +34 -31
- sonusai/mixture/mapped_snr_f.py +0 -100
- {sonusai-0.18.2.dist-info → sonusai-0.18.4.dist-info}/WHEEL +0 -0
- {sonusai-0.18.2.dist-info → sonusai-0.18.4.dist-info}/entry_points.txt +0 -0
sonusai/__init__.py
CHANGED
@@ -10,6 +10,7 @@ commands_doc = """
    calc_metric_spenh             Run speech enhancement and analysis
    doc                           Documentation
    genft                         Generate feature and truth data
+   genmetrics                    Generate mixture metrics data
    genmix                        Generate mixture and truth data
    genmixdb                      Generate a mixture database
    gentcst                       Generate target configuration from a subdirectory tree
sonusai/audiofe.py
CHANGED
@@ -142,7 +142,7 @@ def main() -> None:
     if hparams is None:
         logger.error(f'Error: ONNX model does not have required SonusAI hyperparameters, cannot proceed.')
         raise SystemExit(1)
-    feature_mode = hparams
+    feature_mode = hparams['feature']
     in0name = sess_inputs[0].name
     in0type = sess_inputs[0].type
     out_names = [n.name for n in session.get_outputs()]
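The fix takes the 'feature' entry instead of assigning the whole hyperparameter dict. For context, a minimal sketch of reading SonusAI hyperparameters from an ONNX model with onnxruntime — the metadata key name 'hparams' and the model path are illustrative assumptions, not taken from this diff:

```python
import json

import onnxruntime as ort

# Hypothetical model path; the hyperparameters are assumed to live in the
# model's custom metadata under an assumed 'hparams' key.
session = ort.InferenceSession('model.onnx')
meta = session.get_modelmeta().custom_metadata_map  # dict[str, str]
hparams = json.loads(meta['hparams']) if 'hparams' in meta else None
if hparams is None:
    raise SystemExit(1)  # model lacks the required SonusAI hyperparameters
feature_mode = hparams['feature']  # the 0.18.4 fix: index the dict, not assign it whole
```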
sonusai/calc_metric_spenh.py
CHANGED
@@ -24,7 +24,7 @@ For whisper ASR methods, the possible models used in local processing (ASR = whi
    {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}
    but note most are very computationally demanding and can overwhelm/hang a local system.
 
-Outputs the following to PLOC (where id is
+Outputs the following to PLOC (where id is mixid number 0:num_mixtures):
    <id>_metric_spenh.txt
 
    If --plot:
@@ -74,6 +74,9 @@ from sonusai.mixture import Feature
 from sonusai.mixture import MixtureDatabase
 from sonusai.mixture import Predict
 
+DB_99 = np.power(10, 99 / 10)
+DB_N99 = np.power(10, -99 / 10)
+
 
 def signal_handler(_sig, _frame):
     import sys
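The two new constants are just ±99 dB expressed as linear power ratios; they are used later in this file to replace infinite segmental-SNR values so downstream statistics stay finite. A quick check of the arithmetic:

```python
import numpy as np

DB_99 = np.power(10, 99 / 10)    # +99 dB as a power ratio, about 7.9e9
DB_N99 = np.power(10, -99 / 10)  # -99 dB as a power ratio, about 1.3e-10

# Toy data: clamp +/-inf segmental SNR to finite extremes, as done later in this file
segsnr_f = np.array([0.5, np.inf, -np.inf])
segsnr_f[segsnr_f == np.inf] = DB_99
segsnr_f[segsnr_f == -np.inf] = DB_N99
assert np.all(np.isfinite(segsnr_f))
```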
@@ -122,298 +125,6 @@ def power_uncompress(spec):
     return real_uncompress + 1j * imag_uncompress
 
 
-def snr(clean_speech, processed_speech, sample_rate):
-    # Check the length of the clean and processed speech. Must be the same.
-    clean_length = len(clean_speech)
-    processed_length = len(processed_speech)
-    if clean_length != processed_length:
-        raise ValueError('Both Speech Files must be same length.')
-
-    overall_snr = 10 * np.log10(np.sum(np.square(clean_speech)) / np.sum(np.square(clean_speech - processed_speech)))
-
-    # Global Variables
-    win_length = round(30 * sample_rate / 1000)  # window length in samples
-    skip_rate = int(np.floor(win_length / 4))  # window skip in samples
-    min_snr = -10  # minimum SNR in dB
-    max_snr = 35  # maximum SNR in dB
-
-    # For each frame of input speech, calculate the Segmental SNR
-    num_frames = int(clean_length / skip_rate - (win_length / skip_rate))  # number of frames
-    start = 0  # starting sample
-    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
-
-    segmental_snr = np.empty(num_frames)
-    eps = np.spacing(1)
-    for frame_count in range(num_frames):
-        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
-        clean_frame = clean_speech[start:start + win_length]
-        processed_frame = processed_speech[start:start + win_length]
-        clean_frame = np.multiply(clean_frame, window)
-        processed_frame = np.multiply(processed_frame, window)
-
-        # (2) Compute the Segmental SNR
-        signal_energy = np.sum(np.square(clean_frame))
-        noise_energy = np.sum(np.square(clean_frame - processed_frame))
-        segmental_snr[frame_count] = 10 * np.log10(signal_energy / (noise_energy + eps) + eps)
-        segmental_snr[frame_count] = max(segmental_snr[frame_count], min_snr)
-        segmental_snr[frame_count] = min(segmental_snr[frame_count], max_snr)
-
-        start = start + skip_rate
-
-    return overall_snr, segmental_snr
-
-
-def lp_coefficients(speech_frame, model_order):
-    # (1) Compute Autocorrelation Lags
-    win_length = np.size(speech_frame)
-    autocorrelation = np.empty(model_order + 1)
-    e = np.empty(model_order + 1)
-    for k in range(model_order + 1):
-        autocorrelation[k] = np.dot(speech_frame[0:win_length - k], speech_frame[k: win_length])
-
-    # (2) Levinson-Durbin
-    a = np.ones(model_order)
-    a_past = np.empty(model_order)
-    ref_coefficients = np.empty(model_order)
-    e[0] = autocorrelation[0]
-    for i in range(model_order):
-        a_past[0: i] = a[0: i]
-        sum_term = np.dot(a_past[0: i], autocorrelation[i:0:-1])
-        ref_coefficients[i] = (autocorrelation[i + 1] - sum_term) / e[i]
-        a[i] = ref_coefficients[i]
-        if i == 0:
-            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1:-1:-1], ref_coefficients[i])
-        else:
-            a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1::-1], ref_coefficients[i])
-        e[i + 1] = (1 - ref_coefficients[i] * ref_coefficients[i]) * e[i]
-    lp_params = np.concatenate((np.array([1]), -a))
-    return autocorrelation, ref_coefficients, lp_params
-
-
-def llr(clean_speech, processed_speech, sample_rate):
-    from scipy.linalg import toeplitz
-
-    # Check the length of the clean and processed speech. Must be the same.
-    clean_length = np.size(clean_speech)
-    processed_length = np.size(processed_speech)
-    if clean_length != processed_length:
-        raise ValueError('Both speech files must be same length.')
-
-    # Global Variables
-    win_length = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
-    skip_rate = (np.floor(win_length / 4)).astype(int)  # window skip in samples
-    if sample_rate < 10000:
-        p = 10  # LPC Analysis Order
-    else:
-        p = 16  # this could vary depending on sampling frequency.
-
-    # For each frame of input speech, calculate the Log Likelihood Ratio
-    num_frames = int((clean_length - win_length) / skip_rate)  # number of frames
-    start = 0  # starting sample
-    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
-
-    distortion = np.empty(num_frames)
-    for frame_count in range(num_frames):
-        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
-        clean_frame = clean_speech[start: start + win_length]
-        processed_frame = processed_speech[start: start + win_length]
-        clean_frame = np.multiply(clean_frame, window)
-        processed_frame = np.multiply(processed_frame, window)
-
-        # (2) Get the autocorrelation lags and LPC parameters used to compute the LLR measure.
-        r_clean, ref_clean, a_clean = lp_coefficients(clean_frame, p)
-        r_processed, ref_processed, a_processed = lp_coefficients(processed_frame, p)
-
-        # (3) Compute the LLR measure
-        numerator = np.dot(np.matmul(a_processed, toeplitz(r_clean)), a_processed)
-        denominator = np.dot(np.matmul(a_clean, toeplitz(r_clean)), a_clean)
-        distortion[frame_count] = np.log(numerator / denominator)
-        start = start + skip_rate
-    return distortion
-
-
-def wss(clean_speech, processed_speech, sample_rate):
-    from scipy.fftpack import fft
-
-    # Check the length of the clean and processed speech, which must be the same.
-    clean_length = np.size(clean_speech)
-    processed_length = np.size(processed_speech)
-    if clean_length != processed_length:
-        raise ValueError('Files must have same length.')
-
-    # Global variables
-    win_length = (np.round(30 * sample_rate / 1000)).astype(int)  # window length in samples
-    skip_rate = (np.floor(np.divide(win_length, 4))).astype(int)  # window skip in samples
-    max_freq = (np.divide(sample_rate, 2)).astype(int)  # maximum bandwidth
-    num_crit = 25  # number of critical bands
-
-    n_fft = (np.power(2, np.ceil(np.log2(2 * win_length)))).astype(int)
-    n_fft_by_2 = (np.multiply(0.5, n_fft)).astype(int)  # FFT size/2
-    k_max = 20.0  # value suggested by Klatt, pg 1280
-    k_loc_max = 1.0  # value suggested by Klatt, pg 1280
-
-    # Critical Band Filter Definitions (Center Frequency and Bandwidths in Hz)
-    cent_freq = np.array([50.0000, 120.000, 190.000, 260.000, 330.000, 400.000, 470.000,
-                          540.000, 617.372, 703.378, 798.717, 904.128, 1020.38, 1148.30,
-                          1288.72, 1442.54, 1610.70, 1794.16, 1993.93, 2211.08, 2446.71,
-                          2701.97, 2978.04, 3276.17, 3597.63])
-    bandwidth = np.array([70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000,
-                          77.3724, 86.0056, 95.3398, 105.411, 116.256, 127.914, 140.423,
-                          153.823, 168.154, 183.457, 199.776, 217.153, 235.631, 255.255,
-                          276.072, 298.126, 321.465, 346.136])
-
-    bw_min = bandwidth[0]  # minimum critical bandwidth
-
-    # Set up the critical band filters.
-    # Note here that Gaussian-ly shaped filters are used.
-    # Also, the sum of the filter weights are equivalent for each critical band filter.
-    # Filter less than -30 dB and set to zero.
-    min_factor = np.exp(-30.0 / (2.0 * 2.303))  # -30 dB point of filter
-    crit_filter = np.empty((num_crit, n_fft_by_2))
-    for i in range(num_crit):
-        f0 = (cent_freq[i] / max_freq) * n_fft_by_2
-        bw = (bandwidth[i] / max_freq) * n_fft_by_2
-        norm_factor = np.log(bw_min) - np.log(bandwidth[i])
-        j = np.arange(n_fft_by_2)
-        crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
-        cond = np.greater(crit_filter[i, :], min_factor)
-        crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
-    # For each frame of input speech, calculate the Weighted Spectral Slope Measure
-    num_frames = int(clean_length / skip_rate - (win_length / skip_rate))  # number of frames
-    start = 0  # starting sample
-    window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
-
-    distortion = np.empty(num_frames)
-    for frame_count in range(num_frames):
-        # (1) Get the Frames for the test and reference speech. Multiply by Hanning Window.
-        clean_frame = clean_speech[start: start + win_length] / 32768
-        processed_frame = processed_speech[start: start + win_length] / 32768
-        clean_frame = np.multiply(clean_frame, window)
-        processed_frame = np.multiply(processed_frame, window)
-        # (2) Compute the Power Spectrum of Clean and Processed
-        clean_spec = np.square(np.abs(fft(clean_frame, n_fft)))
-        processed_spec = np.square(np.abs(fft(processed_frame, n_fft)))
-
-        # (3) Compute Filterbank Output Energies (in dB scale)
-        clean_energy = np.matmul(crit_filter, clean_spec[0:n_fft_by_2])
-        processed_energy = np.matmul(crit_filter, processed_spec[0:n_fft_by_2])
-
-        clean_energy = 10 * np.log10(np.maximum(clean_energy, 1E-10))
-        processed_energy = 10 * np.log10(np.maximum(processed_energy, 1E-10))
-
-        # (4) Compute Spectral Slope (dB[i+1]-dB[i])
-        clean_slope = clean_energy[1:num_crit] - clean_energy[0: num_crit - 1]
-        processed_slope = processed_energy[1:num_crit] - processed_energy[0: num_crit - 1]
-
-        # (5) Find the nearest peak locations in the spectra to each critical band.
-        # If the slope is negative, we search to the left. If positive, we search to the right.
-        clean_loc_peak = np.empty(num_crit - 1)
-        processed_loc_peak = np.empty(num_crit - 1)
-
-        for i in range(num_crit - 1):
-            # find the peaks in the clean speech signal
-            if clean_slope[i] > 0:  # search to the right
-                n = i
-                while (n < num_crit - 1) and (clean_slope[n] > 0):
-                    n = n + 1
-                clean_loc_peak[i] = clean_energy[n - 1]
-            else:  # search to the left
-                n = i
-                while (n >= 0) and (clean_slope[n] <= 0):
-                    n = n - 1
-                clean_loc_peak[i] = clean_energy[n + 1]
-
-            # find the peaks in the processed speech signal
-            if processed_slope[i] > 0:  # search to the right
-                n = i
-                while (n < num_crit - 1) and (processed_slope[n] > 0):
-                    n = n + 1
-                processed_loc_peak[i] = processed_energy[n - 1]
-            else:  # search to the left
-                n = i
-                while (n >= 0) and (processed_slope[n] <= 0):
-                    n = n - 1
-                processed_loc_peak[i] = processed_energy[n + 1]
-
-        # (6) Compute the WSS Measure for this frame. This includes determination of the weighting function.
-        db_max_clean = np.max(clean_energy)
-        db_max_processed = np.max(processed_energy)
-        '''
-        The weights are calculated by averaging individual weighting factors from the clean and processed frame.
-        These weights w_clean and w_processed should range from 0 to 1 and place more emphasis on spectral peaks
-        and less emphasis on slope differences in spectral valleys.
-        This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
-        '''
-        w_max_clean = np.divide(k_max, k_max + db_max_clean - clean_energy[0: num_crit - 1])
-        w_loc_max_clean = np.divide(k_loc_max, k_loc_max + clean_loc_peak - clean_energy[0: num_crit - 1])
-        w_clean = np.multiply(w_max_clean, w_loc_max_clean)
-
-        w_max_processed = np.divide(k_max, k_max + db_max_processed - processed_energy[0: num_crit - 1])
-        w_loc_max_processed = np.divide(k_loc_max, k_loc_max + processed_loc_peak - processed_energy[0: num_crit - 1])
-        w_processed = np.multiply(w_max_processed, w_loc_max_processed)
-
-        w = np.divide(np.add(w_clean, w_processed), 2.0)
-        slope_diff = np.subtract(clean_slope, processed_slope)[0: num_crit - 1]
-        distortion[frame_count] = np.dot(w, np.square(slope_diff)) / np.sum(w)
-        # This normalization is not part of Klatt's paper, but helps to normalize the measure.
-        # Here we scale the measure by the sum of the weights.
-        start = start + skip_rate
-    return distortion
-
-
-def calc_speech_metrics(hypothesis: np.ndarray,
-                        reference: np.ndarray) -> tuple[float, int, int, int, float]:
-    """
-    Calculate speech metrics pesq_mos, c_sig, c_bak, c_ovl, seg_snr. These are all related and thus included
-    in one function. Reference: matlab script "compute_metrics.m".
-
-    Usage:
-        pesq, c_sig, c_bak, c_ovl, ssnr = compute_metrics(hypothesis, reference, fs, path)
-        reference: clean audio as array
-        hypothesis: enhanced audio as array
-        Audio must have sampling rate = 16000 Hz.
-
-    Example call:
-        pesq_output, csig_output, cbak_output, covl_output, ssnr_output = \
-            calc_speech_metrics(predicted_audio, target_audio)
-    """
-    from sonusai.metrics import calc_pesq
-
-    fs = 16000
-
-    # compute the WSS measure
-    wss_dist_vec = wss(reference, hypothesis, fs)
-    wss_dist_vec = np.sort(wss_dist_vec)
-    alpha = 0.95  # value from CMGAN ref implementation
-    wss_dist = np.mean(wss_dist_vec[0: round(np.size(wss_dist_vec) * alpha)])
-
-    # compute the LLR measure
-    llr_dist = llr(reference, hypothesis, fs)
-    ll_rs = np.sort(llr_dist)
-    llr_len = round(np.size(llr_dist) * alpha)
-    llr_mean = np.mean(ll_rs[0: llr_len])
-
-    # compute the SNRseg
-    snr_dist, segsnr_dist = snr(reference, hypothesis, fs)
-    seg_snr = np.mean(segsnr_dist)
-
-    # compute the pesq (use Sonusai wrapper, only fs=16k, mode=wb support)
-    pesq_mos = calc_pesq(hypothesis=hypothesis, reference=reference)
-
-    # now compute the composite measures
-    c_sig = 3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist
-    c_sig = max(1, c_sig)
-    c_sig = min(5, c_sig)  # limit values to [1, 5]
-    c_bak = 1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * seg_snr
-    c_bak = max(1, c_bak)
-    c_bak = min(5, c_bak)  # limit values to [1, 5]
-    c_ovl = 1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist
-    c_ovl = max(1, c_ovl)
-    c_ovl = min(5, c_ovl)  # limit values to [1, 5]
-
-    return pesq_mos, c_sig, c_bak, c_ovl, seg_snr
-
-
 def mean_square_error(hypothesis: np.ndarray,
                       reference: np.ndarray,
                       squared: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
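This whole block of intrusive speech-quality helpers (snr, lp_coefficients, llr, wss, calc_speech_metrics) leaves calc_metric_spenh.py; per the file list above, the equivalent functionality lands in the new sonusai/metrics/calc_speech.py (+312 lines). For reference, a minimal restatement of the composite CSIG/CBAK/COVL measures from the removed code — whether calc_speech.py keeps exactly this form is not shown in this diff:

```python
import numpy as np


def composite_measures(pesq_mos: float, llr_mean: float,
                       wss_dist: float, seg_snr: float) -> tuple[float, float, float]:
    """CSIG/CBAK/COVL composites as computed by the removed calc_speech_metrics."""
    c_sig = float(np.clip(3.093 - 1.029 * llr_mean + 0.603 * pesq_mos - 0.009 * wss_dist, 1, 5))
    c_bak = float(np.clip(1.634 + 0.478 * pesq_mos - 0.007 * wss_dist + 0.063 * seg_snr, 1, 5))
    c_ovl = float(np.clip(1.594 + 0.805 * pesq_mos - 0.512 * llr_mean - 0.007 * wss_dist, 1, 5))
    return c_sig, c_bak, c_ovl
```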
@@ -494,48 +205,6 @@ def log_error(reference: np.ndarray, hypothesis: np.ndarray) -> tuple[np.ndarray
     return err, err_b, err_f
 
 
-def phase_distance(reference: np.ndarray,
-                   hypothesis: np.ndarray,
-                   eps: float = 1e-9) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Calculate weighted phase distance error (weight normalization over bins per frame)
-
-    :param reference: complex [frames, bins]
-    :param hypothesis: complex [frames, bins]
-    :param eps: epsilon value
-    :return: mean, mean per bin, mean per frame
-    """
-    ang_diff = np.angle(reference) - np.angle(hypothesis)
-    phd_mod = (ang_diff + np.pi) % (2 * np.pi) - np.pi
-    rh_angle_diff = phd_mod * 180 / np.pi  # angle diff in deg
-
-    # Use complex divide to intrinsically keep angle diff +/-180 deg, but avoid div by zero (real hyp)
-    # hyp_real = np.real(hypothesis)
-    # near_zeros = np.real(hyp_real) < eps
-    # hyp_real = hyp_real * (np.logical_not(near_zeros))
-    # hyp_real = hyp_real + (near_zeros * eps)
-    # hypothesis = hyp_real + 1j*np.imag(hypothesis)
-    # rh_angle_diff = np.angle(reference / hypothesis) * 180 / np.pi  # angle diff +/-180
-
-    # weighted mean over all (scalar)
-    reference_mag = np.abs(reference)
-    ref_weight = reference_mag / (np.sum(reference_mag) + eps)  # frames x bins
-    err = np.around(np.sum(ref_weight * rh_angle_diff), 3)
-
-    # weighted mean over frames (value per bin)
-    err_b = np.zeros(reference.shape[1])
-    for bi in range(reference.shape[1]):
-        ref_weight = reference_mag[:, bi] / (np.sum(reference_mag[:, bi], axis=0) + eps)
-        err_b[bi] = np.around(np.sum(ref_weight * rh_angle_diff[:, bi]), 3)
-
-    # weighted mean over bins (value per frame)
-    err_f = np.zeros(reference.shape[0])
-    for fi in range(reference.shape[0]):
-        ref_weight = reference_mag[fi, :] / (np.sum(reference_mag[fi, :]) + eps)
-        err_f[fi] = np.around(np.sum(ref_weight * rh_angle_diff[fi, :]), 3)
-
-    return err, err_b, err_f
-
-
 def plot_mixpred(mixture: AudioT,
                  mixture_f: AudioF,
                  target: Optional[AudioT] = None,
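phase_distance likewise moves into the metrics package as the new sonusai/metrics/calc_phase_distance.py (+43 lines in the file list). The core of the removed computation, condensed into a self-contained sketch of the magnitude-weighted scalar error:

```python
import numpy as np


def phase_distance_scalar(reference: np.ndarray, hypothesis: np.ndarray,
                          eps: float = 1e-9) -> float:
    """Magnitude-weighted mean phase error in degrees over all frames and bins."""
    ang_diff = np.angle(reference) - np.angle(hypothesis)
    wrapped = (ang_diff + np.pi) % (2 * np.pi) - np.pi  # wrap to [-pi, pi)
    deg = wrapped * 180 / np.pi
    weight = np.abs(reference) / (np.sum(np.abs(reference)) + eps)
    return float(np.around(np.sum(weight * deg), 3))


# Identical spectra give zero phase distance
spec = np.exp(1j * np.linspace(0, np.pi, 8)).reshape(2, 4)
assert phase_distance_scalar(spec, spec) == 0.0
```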
@@ -543,7 +212,6 @@ def plot_mixpred(mixture: AudioT,
                  predict: Optional[Predict] = None,
                  tp_title: str = '') -> plt.Figure:
     from sonusai.mixture import SAMPLE_RATE
-
     num_plots = 2
     if feature is not None:
         num_plots += 1
@@ -706,12 +374,13 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     import h5py
     import mgzip
     from matplotlib.backends.backend_pdf import PdfPages
-    from numpy import inf
     from pystoi import stoi
 
     from sonusai import SonusAIError
     from sonusai import logger
     from sonusai.metrics import calc_pcm
+    from sonusai.metrics import calc_phase_distance
+    from sonusai.metrics import calc_speech
     from sonusai.metrics import calc_wer
     from sonusai.metrics import calc_wsdr
     from sonusai.mixture import forward_transform
@@ -771,24 +440,28 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # noise_wo_dist_f = mixdb.mixture_noise_f(mixid, noise=noise_wo_dist)
     noise = mixture - target  # has time-domain distortion (ir,etc.) but does not have specaugment
     # noise_f = mixdb.mixture_noise_f(mixid, noise=noise)
-
+    # note: uses pre-IR, pre-specaug audio
+    segsnr_f: np.ndarray = mixdb.mixture_metrics(mixid, ['ssnr'])[0]  # type: ignore
     mixture_f = mixdb.mixture_mixture_f(mixid, mixture=mixture)
     noise_f = mixture_f - target_f  # true noise in freq domain includes specaugment and time-domain ir,distortions
     # segsnr_f = mixdb.mixture_segsnr(mixid, target=target, noise=noise)
-    segsnr_f[segsnr_f == inf] =
-    segsnr_f
+    segsnr_f[segsnr_f == np.inf] = DB_99
+    # segsnr_f should never be -np.inf
+    segsnr_f[segsnr_f == -np.inf] = DB_N99
     # need to use inv-tf to match #samples & latency shift properties of predict inv tf
     target_fi = inverse_transform(target_f, mixdb.it_config)
     noise_fi = inverse_transform(noise_f, mixdb.it_config)
     # mixture_fi = mixdb.inverse_transform(mixture_f)
 
     # gen feature, truth - note feature only used for plots
-    #
-    feature, truth_f = mixdb.mixture_ft(mixid,
-
-
-
-
+    # TODO: parse truth_f for different formats
+    feature, truth_f = mixdb.mixture_ft(mixid, mixture_f=mixture_f)
+    # ignore mixup
+    for truth_setting in mixdb.target_file(mixdb.mixture(mixid).targets[0].file_id).truth_settings:
+        if truth_setting.function == 'target_mixture_f':
+            half = truth_f.shape[-1] // 2
+            # extract target_f only
+            truth_f = truth_f[..., :half]
 
     if not truth_est_mode:
         if predict.shape[0] < target_f.shape[0]:  # target_f, truth_f, mixture_f, etc. same size
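The new loop special-cases the 'target_mixture_f' truth function, whose truth vector concatenates the target and mixture spectra along the last axis, so keeping the first half recovers the target portion. A toy illustration of the slicing (the bin count here is made up):

```python
import numpy as np

bins = 257  # hypothetical; depends on the feature/transform configuration
truth_f = np.zeros((10, 2 * bins))  # frames x (target_f ++ mixture_f)
half = truth_f.shape[-1] // 2
truth_f = truth_f[..., :half]  # keep target_f only
assert truth_f.shape == (10, bins)
```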
@@ -843,12 +516,12 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     pcm, pcm_bin, pcm_frame = calc_pcm(hypothesis=ypred_f, reference=ytrue_f, with_log=True)
 
     # Phase distance
-    phd, phd_bin, phd_frame =
+    phd, phd_bin, phd_frame = calc_phase_distance(hypothesis=predict_complex, reference=truth_f_complex)
 
     # Noise td logerr
     # lerr_nt, lerr_nt_bin, lerr_nt_frame = log_error(noise_fi, noise_truth_est_audio)
 
-    # # SA-SDR (time-domain source-
+    # # SA-SDR (time-domain source-aggregated SDR)
     ytrue = np.concatenate((target_fi[:, np.newaxis], noise_fi[:, np.newaxis]), axis=1)
     ypred = np.concatenate((target_est_wav[:, np.newaxis], noise_est_wav[:, np.newaxis]), axis=1)
     # # note: w/o scale is more pessimistic number
@@ -863,8 +536,8 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     # Speech intelligibility measure - PESQ
     if int(mixdb.mixture(mixid).snr) > -99:
         # len = target_est_wav.shape[0]
-        pesq_speech, csig_tg, cbak_tg, covl_tg
-        pesq_mixture, csig_mx, cbak_mx, covl_mx
+        pesq_speech, csig_tg, cbak_tg, covl_tg = calc_speech(target_est_wav, target_fi)
+        pesq_mixture, csig_mx, cbak_mx, covl_mx = mixdb.mixture_metrics(mixid, ['mxpesq', 'mxcsig', 'mxcbak', 'mxcovl'])
         # pesq_speech_tst = calc_pesq(hypothesis=target_est_wav, reference=target)
         # pesq_mixture_tst = calc_pesq(hypothesis=mixture, reference=target)
         # pesq improvement
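Together with the segmental-SNR hunk above, this moves per-mixture metric computation onto the new MixtureDatabase.mixture_metrics API (backed by the reworked sonusai/mixture/mixdb.py and the new genmetrics command). A hedged usage sketch using only metric names visible in this diff — 'ssnr', 'mxpesq', 'mxcsig', 'mxcbak', 'mxcovl', and 'mxwer.<asr_name>'; the database location is hypothetical:

```python
from sonusai.mixture import MixtureDatabase

mixdb = MixtureDatabase('path/to/mixdb')  # hypothetical location
mixid = 0

# mixture_metrics takes a list of metric names and returns one result per name
segsnr_f, = mixdb.mixture_metrics(mixid, ['ssnr'])

# Several at once: PESQ and composite measures of the noisy mixture
pesq_mx, csig_mx, cbak_mx, covl_mx = mixdb.mixture_metrics(
    mixid, ['mxpesq', 'mxcsig', 'mxcbak', 'mxcovl'])
```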
@@ -886,20 +559,15 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     asr_tt = None
     asr_mx = None
     asr_tge = None
-
-
-
-        wer_pi = float('nan')
-    else:
+    asr_engines = list(mixdb.asr_configs.keys())
+    if len(asr_engines) > 0 and mixdb.mixture(mixid).snr >= -96:  # noise only, ignore/reset target asr
+        wer_mx = float(mixdb.mixture_metrics(mixid, [f'mxwer.{asr_engines[0]}'])[0]) * 100
         asr_tt = MP_GLOBAL.mixdb.mixture_speech_metadata(mixid, 'text')[0]  # ignore mixup
         if asr_tt is None:
             asr_tt = calc_asr(target, engine=asr_method, whisper_model_name=asr_model_name).text  # target truth
 
         if asr_tt:
-            asr_mx = calc_asr(mixture, engine=asr_method, whisper_model_name=asr_model_name).text
             asr_tge = calc_asr(target_est_wav, engine=asr_method, whisper_model_name=asr_model_name).text
-
-            wer_mx = calc_wer(asr_mx, asr_tt).wer * 100  # mixture wer
             wer_tge = calc_wer(asr_tge, asr_tt).wer * 100  # target estimate wer
             if wer_mx == 0.0:
                 if wer_tge == 0.0:
@@ -913,6 +581,10 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
                 wer_mx = float(0)
                 wer_tge = float(0)
                 wer_pi = float(0)
+    else:
+        wer_mx = float('nan')
+        wer_tge = float('nan')
+        wer_pi = float('nan')
 
     # 5) Save per mixture metric results
     # Single row in table of scalar metrics per mixture
@@ -1088,7 +760,6 @@ def main():
     from os.path import basename
     from os.path import isdir
     from os.path import join
-    from os.path import split
 
     import psutil
     from tqdm import tqdm
@@ -1097,7 +768,7 @@ def main():
     from sonusai import initial_log_messages
     from sonusai import logger
     from sonusai import update_console_handler
-    from sonusai.mixture import
+    from sonusai.mixture import DEFAULT_SPEECH
     from sonusai.mixture import MixtureDatabase
     from sonusai.mixture import read_audio
     from sonusai.utils import calc_asr
@@ -1173,8 +844,7 @@ def main():
         return
 
     if enable_asr_warmup:
-
-        audio = read_audio(default_speech)
+        audio = read_audio(DEFAULT_SPEECH)
         logger.info(f'Warming up asr method, note for cloud service this could take up to a few min ...')
         asr_chk = calc_asr(audio, engine=asr_method, whisper_model_name=asr_model_name)
         logger.info(f'Warmup completed, results {asr_chk}')
sonusai/data/genmixdb.yml
CHANGED
sonusai/doc/doc.py
CHANGED
@@ -199,7 +199,6 @@ def get_truth_functions() -> str:
 def doc_truth_settings() -> str:
     import yaml
 
-    from sonusai.mixture import get_default_config
     default = f"\nDefault value:\n\n{yaml.dump(get_default_config()['truth_settings'])}"
     return """
 'truth_settings' is a mixture database configuration parameter that sets the truth
@@ -375,7 +374,6 @@ This rule expands to 6 unique augmentations being applied to each target
 def doc_target_augmentations() -> str:
     import yaml
 
-    from sonusai.mixture import get_default_config
     default = f"\nDefault value:\n\n{yaml.dump(get_default_config()['target_augmentations'])}"
     return """
 'target_augmentations' is a mixture database configuration parameter that
@@ -388,7 +386,6 @@ See 'augmentations' for details on augmentation rules.
 def doc_class_balancing_augmentation() -> str:
     import yaml
 
-    from sonusai.mixture import get_default_config
     default = f"\nDefault value:\n\n{yaml.dump(get_default_config()['class_balancing_augmentation'])}"
     return """
 'class_balancing_augmentation' is a mixture database configuration parameter
@@ -436,7 +433,6 @@ Required field:
 def doc_noise_augmentations() -> str:
     import yaml
 
-    from sonusai.mixture import get_default_config
     default = f"\nDefault value:\n\n{yaml.dump(get_default_config()['noise_augmentations'])}"
     return """
 'noise_augmentations' is a mixture database configuration parameter that
@@ -536,3 +532,48 @@ def doc_config() -> str:
     for c in VALID_CONFIGS:
         text += f' {c}\n'
     return text
+
+
+def doc_asr_configs() -> str:
+    from sonusai.utils import get_available_engines
+
+    default = f"\nDefault value: {get_default_config()['asr_configs']}"
+    engines = get_available_engines()
+    text = """
+'asr_configs' is a mixture database configuration parameter that sets the list of
+ASR engine(s) to use.
+
+Required fields:
+
+    'name'          Unique identifier for the ASR engine.
+    'engine'        ASR engine to use. Available engines:
+"""
+    text += f' {", ".join(engines)}\n'
+    text += """
+Optional fields:
+
+    'model'         Some ASR engines allow the specification of a model, but note most are
+                    very computationally demanding and can overwhelm/hang a local system.
+                    Available whisper ASR engines:
+                    tiny.en, tiny, base.en, base, small.en, small, medium.en, medium, large-v1, large-v2, large
+    'device'        Some ASR engines allow the specification of a device, either 'cpu' or 'cuda'.
+    'cpu_threads'   Some ASR engines allow the specification of the number of CPU threads to use.
+    'compute_type'  Some ASR engines allow the specification of a compute type, e.g. 'int8'.
+    'beam_size'     Some ASR engines allow the specification of a beam size.
+    <other>         Other parameters can be injected into the ASR engine as needed; all
+                    fields in each config are forwarded to the given engine.
+
+Example:
+
+asr_configs:
+  - name: faster_tiny_cuda
+    engine: faster_whisper
+    model: tiny
+    device: cuda
+    beam_size: 5
+  - name: google
+    engine: google
+
+Creates two ASR engines for use named faster_tiny_cuda and google.
+"""
+    return text + default