sonusai 0.18.1__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -0,0 +1,312 @@
+ import numpy as np
+
+ from sonusai.mixture.constants import SAMPLE_RATE
+ from sonusai.mixture.datatypes import SpeechMetrics
+ from .calc_pesq import calc_pesq
+
+
+ def calc_speech(hypothesis: np.ndarray, reference: np.ndarray, sample_rate: int = SAMPLE_RATE) -> SpeechMetrics:
+     """Calculate the speech metrics pesq, c_sig, c_bak, and c_ovl.
+
+     These are all related and thus included in one function. Segmental SNR is computed
+     internally for c_bak but is not returned. Reference: MATLAB script "compute_metrics.m".
+
+     :param hypothesis: estimated audio
+     :param reference: reference audio
+     :param sample_rate: sample rate of audio
+     :return: SpeechMetrics named tuple
+     """
+
+     # Weighted spectral slope measure
+     wss_dist_vec = _calc_weighted_spectral_slope_measure(hypothesis=hypothesis,
+                                                          reference=reference,
+                                                          sample_rate=sample_rate)
+     wss_dist_vec = np.sort(wss_dist_vec)
+
+     # Value from CMGAN reference implementation
+     alpha = 0.95
+     wss_dist = np.mean(wss_dist_vec[0: round(np.size(wss_dist_vec) * alpha)])
+
+     # Log likelihood ratio measure
+     llr_dist = _calc_log_likelihood_ratio_measure(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
+     ll_rs = np.sort(llr_dist)
+     llr_len = round(np.size(llr_dist) * alpha)
+     llr_mean = np.mean(ll_rs[:llr_len])
+
+     # Segmental SNR
+     snr_dist, segsnr_dist = _calc_snr(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
+     seg_snr = np.mean(segsnr_dist)
+
+     # PESQ
+     _pesq = calc_pesq(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
+
+     # Now compute the composite measures
+     c_sig = np.clip(3.093 - 1.029 * llr_mean + 0.603 * _pesq - 0.009 * wss_dist, 1, 5)
+     c_bak = np.clip(1.634 + 0.478 * _pesq - 0.007 * wss_dist + 0.063 * seg_snr, 1, 5)
+     c_ovl = np.clip(1.594 + 0.805 * _pesq - 0.512 * llr_mean - 0.007 * wss_dist, 1, 5)
+
+     return SpeechMetrics(_pesq, c_sig, c_bak, c_ovl)
+
+
+ def _calc_weighted_spectral_slope_measure(hypothesis: np.ndarray,
+                                           reference: np.ndarray,
+                                           sample_rate: int = SAMPLE_RATE) -> np.ndarray:
+     from scipy.fftpack import fft
+
+     # The lengths of the reference and hypothesis must be the same.
+     reference_length = np.size(reference)
+     hypothesis_length = np.size(hypothesis)
+     if reference_length != hypothesis_length:
+         raise ValueError('Hypothesis and reference must be the same length.')
+
+     # Window length in samples
+     win_length = int(np.round(30 * sample_rate / 1000))
+     # Window skip in samples
+     skip_rate = int(np.floor(np.divide(win_length, 4)))
+     # Maximum bandwidth
+     max_freq = int(np.divide(sample_rate, 2))
+     num_crit = 25
+
+     n_fft = int(np.power(2, np.ceil(np.log2(2 * win_length))))
+     n_fft_by_2 = int(np.multiply(0.5, n_fft))
+     # Value suggested by Klatt, pg 1280
+     k_max = 20.0
+     # Value suggested by Klatt, pg 1280
+     k_loc_max = 1.0
+
+     # Critical band filter definitions (center frequency and bandwidths in Hz)
+     cent_freq = np.array([50.0000, 120.000, 190.000, 260.000, 330.000, 400.000, 470.000,
+                           540.000, 617.372, 703.378, 798.717, 904.128, 1020.38, 1148.30,
+                           1288.72, 1442.54, 1610.70, 1794.16, 1993.93, 2211.08, 2446.71,
+                           2701.97, 2978.04, 3276.17, 3597.63])
+     bandwidth = np.array([70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000, 70.0000,
+                           77.3724, 86.0056, 95.3398, 105.411, 116.256, 127.914, 140.423,
+                           153.823, 168.154, 183.457, 199.776, 217.153, 235.631, 255.255,
+                           276.072, 298.126, 321.465, 346.136])
+
+     # Minimum critical bandwidth
+     bw_min = bandwidth[0]
+
+     # Set up the critical band filters.
+     # Note that Gaussian-shaped filters are used.
+     # The sum of the filter weights is the same for each critical band filter.
+     # Filter values below -30 dB are set to zero.
+
+     # -30 dB point of filter
+     min_factor = np.exp(-30.0 / (2.0 * 2.303))
+     crit_filter = np.empty((num_crit, n_fft_by_2))
+     for i in range(num_crit):
+         f0 = (cent_freq[i] / max_freq) * n_fft_by_2
+         bw = (bandwidth[i] / max_freq) * n_fft_by_2
+         norm_factor = np.log(bw_min) - np.log(bandwidth[i])
+         j = np.arange(n_fft_by_2)
+         crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
+         cond = np.greater(crit_filter[i, :], min_factor)
+         crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
+
+     # For each frame of input speech, calculate the weighted spectral slope measure
+     num_frames = int(reference_length / skip_rate - (win_length / skip_rate))
+     start = 0
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
+
+     distortion = np.empty(num_frames)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         reference_frame = reference[start: start + win_length] / 32768
+         hypothesis_frame = hypothesis[start: start + win_length] / 32768
+         reference_frame = np.multiply(reference_frame, window)
+         hypothesis_frame = np.multiply(hypothesis_frame, window)
+
+         # (2) Compute the power spectrum of reference and hypothesis
+         reference_spec = np.square(np.abs(fft(reference_frame, n_fft)))
+         hypothesis_spec = np.square(np.abs(fft(hypothesis_frame, n_fft)))
+
+         # (3) Compute filter bank output energies (in dB scale)
+         reference_energy = np.matmul(crit_filter, reference_spec[0:n_fft_by_2])
+         hypothesis_energy = np.matmul(crit_filter, hypothesis_spec[0:n_fft_by_2])
+
+         reference_energy = 10 * np.log10(np.maximum(reference_energy, 1E-10))
+         hypothesis_energy = 10 * np.log10(np.maximum(hypothesis_energy, 1E-10))
+
+         # (4) Compute spectral slope (dB[i+1]-dB[i])
+         reference_slope = reference_energy[1:num_crit] - reference_energy[0: num_crit - 1]
+         hypothesis_slope = hypothesis_energy[1:num_crit] - hypothesis_energy[0: num_crit - 1]
+
+         # (5) Find the nearest peak locations in the spectra to each critical band.
+         #     If the slope is negative, search to the left; if positive, search to the right.
+         reference_loc_peak = np.empty(num_crit - 1)
+         hypothesis_loc_peak = np.empty(num_crit - 1)
+
+         for i in range(num_crit - 1):
+             # Find the peaks in the reference speech signal
+             if reference_slope[i] > 0:
+                 # Search to the right
+                 n = i
+                 while (n < num_crit - 1) and (reference_slope[n] > 0):
+                     n = n + 1
+                 reference_loc_peak[i] = reference_energy[n - 1]
+             else:
+                 # Search to the left
+                 n = i
+                 while (n >= 0) and (reference_slope[n] <= 0):
+                     n = n - 1
+                 reference_loc_peak[i] = reference_energy[n + 1]
+
+             # Find the peaks in the hypothesis speech signal
+             if hypothesis_slope[i] > 0:
+                 # Search to the right
+                 n = i
+                 while (n < num_crit - 1) and (hypothesis_slope[n] > 0):
+                     n = n + 1
+                 hypothesis_loc_peak[i] = hypothesis_energy[n - 1]
+             else:
+                 # Search to the left
+                 n = i
+                 while (n >= 0) and (hypothesis_slope[n] <= 0):
+                     n = n - 1
+                 hypothesis_loc_peak[i] = hypothesis_energy[n + 1]
+
+         # (6) Compute the weighted spectral slope measure for this frame.
+         #     This includes determination of the weighting function.
+         db_max_reference = np.max(reference_energy)
+         db_max_hypothesis = np.max(hypothesis_energy)
+
+         # The weights are calculated by averaging individual weighting factors from the reference and hypothesis
+         # frame. These weights w_reference and w_hypothesis should range from 0 to 1 and place more emphasis on
+         # spectral peaks and less emphasis on slope differences in spectral valleys.
+         # This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
+         w_max_reference = np.divide(k_max, k_max + db_max_reference - reference_energy[0: num_crit - 1])
+         w_loc_max_reference = np.divide(k_loc_max, k_loc_max + reference_loc_peak - reference_energy[0: num_crit - 1])
+         w_reference = np.multiply(w_max_reference, w_loc_max_reference)
+
+         w_max_hypothesis = np.divide(k_max, k_max + db_max_hypothesis - hypothesis_energy[0: num_crit - 1])
+         w_loc_max_hypothesis = np.divide(k_loc_max,
+                                          k_loc_max + hypothesis_loc_peak - hypothesis_energy[0: num_crit - 1])
+         w_hypothesis = np.multiply(w_max_hypothesis, w_loc_max_hypothesis)
+
+         # Scaling by the sum of the weights is not part of Klatt's paper, but helps normalize the measure.
+         w = np.divide(np.add(w_reference, w_hypothesis), 2.0)
+         slope_diff = np.subtract(reference_slope, hypothesis_slope)[0: num_crit - 1]
+         distortion[frame_count] = np.dot(w, np.square(slope_diff)) / np.sum(w)
+
+         start = start + skip_rate
+
+     return distortion
+
+
+ def _calc_log_likelihood_ratio_measure(hypothesis: np.ndarray,
+                                        reference: np.ndarray,
+                                        sample_rate: int = SAMPLE_RATE) -> np.ndarray:
+     from scipy.linalg import toeplitz
+
+     # The lengths of the reference and hypothesis must be the same.
+     reference_length = np.size(reference)
+     hypothesis_length = np.size(hypothesis)
+     if reference_length != hypothesis_length:
+         raise ValueError('Hypothesis and reference must be the same length.')
+
+     # Window length in samples
+     win_length = int(np.round(30 * sample_rate / 1000))
+     # Window skip in samples
+     skip_rate = int(np.floor(win_length / 4))
+     # LPC analysis order; this could vary depending on sampling frequency.
+     if sample_rate < 10000:
+         p = 10
+     else:
+         p = 16
+
+     # For each frame of input speech, calculate the log likelihood ratio
+     num_frames = int((reference_length - win_length) / skip_rate)
+     start = 0
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
+
+     distortion = np.empty(num_frames)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         reference_frame = reference[start: start + win_length]
+         hypothesis_frame = hypothesis[start: start + win_length]
+         reference_frame = np.multiply(reference_frame, window)
+         hypothesis_frame = np.multiply(hypothesis_frame, window)
+
+         # (2) Get the autocorrelation lags and LPC parameters used to compute the log likelihood ratio measure.
+         r_reference, ref_reference, a_reference = _lp_coefficients(reference_frame, p)
+         r_hypothesis, ref_hypothesis, a_hypothesis = _lp_coefficients(hypothesis_frame, p)
+
+         # (3) Compute the log likelihood ratio measure
+         numerator = np.dot(np.matmul(a_hypothesis, toeplitz(r_reference)), a_hypothesis)
+         denominator = np.dot(np.matmul(a_reference, toeplitz(r_reference)), a_reference)
+         distortion[frame_count] = np.log(numerator / denominator)
+         start = start + skip_rate
+
+     return distortion
+
+
+ def _calc_snr(hypothesis: np.ndarray,
+               reference: np.ndarray,
+               sample_rate: int = SAMPLE_RATE) -> tuple[float, np.ndarray]:
+     # The lengths of the reference and hypothesis must be the same.
+     reference_length = len(reference)
+     hypothesis_length = len(hypothesis)
+     if reference_length != hypothesis_length:
+         raise ValueError('Hypothesis and reference must be the same length.')
+
+     overall_snr = 10 * np.log10(np.sum(np.square(reference)) / np.sum(np.square(reference - hypothesis)))
+
+     # Window length in samples
+     win_length = round(30 * sample_rate / 1000)
+     # Window skip in samples
+     skip_rate = int(np.floor(win_length / 4))
+     # Minimum SNR in dB
+     min_snr = -10
+     # Maximum SNR in dB
+     max_snr = 35
+
+     # For each frame of input speech, calculate the segmental SNR
+     num_frames = int(reference_length / skip_rate - (win_length / skip_rate))
+     start = 0
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
+
+     segmental_snr = np.empty(num_frames)
+     eps = np.spacing(1)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         reference_frame = reference[start:start + win_length]
+         hypothesis_frame = hypothesis[start:start + win_length]
+         reference_frame = np.multiply(reference_frame, window)
+         hypothesis_frame = np.multiply(hypothesis_frame, window)
+
+         # (2) Compute the segmental SNR
+         signal_energy = np.sum(np.square(reference_frame))
+         noise_energy = np.sum(np.square(reference_frame - hypothesis_frame))
+         segmental_snr[frame_count] = np.clip(10 * np.log10(signal_energy / (noise_energy + eps) + eps),
+                                              min_snr,
+                                              max_snr)
+
+         start = start + skip_rate
+
+     return overall_snr, segmental_snr
+
+
+ def _lp_coefficients(speech_frame, model_order):
+     # (1) Compute autocorrelation lags
+     win_length = np.size(speech_frame)
+     autocorrelation = np.empty(model_order + 1)
+     e = np.empty(model_order + 1)
+     for k in range(model_order + 1):
+         autocorrelation[k] = np.dot(speech_frame[0:win_length - k], speech_frame[k: win_length])
+
+     # (2) Levinson-Durbin recursion
+     a = np.ones(model_order)
+     a_past = np.empty(model_order)
+     ref_coefficients = np.empty(model_order)
+     e[0] = autocorrelation[0]
+     for i in range(model_order):
+         a_past[0: i] = a[0: i]
+         sum_term = np.dot(a_past[0: i], autocorrelation[i:0:-1])
+         ref_coefficients[i] = (autocorrelation[i + 1] - sum_term) / e[i]
+         a[i] = ref_coefficients[i]
+         if i == 0:
+             a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1:-1:-1], ref_coefficients[i])
+         else:
+             a[0: i] = a_past[0: i] - np.multiply(a_past[i - 1::-1], ref_coefficients[i])
+         e[i + 1] = (1 - ref_coefficients[i] * ref_coefficients[i]) * e[i]
+     lp_params = np.concatenate((np.array([1]), -a))
+     return autocorrelation, ref_coefficients, lp_params
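
The composite measures above follow the standard Hu and Loizou regressions: c_sig, c_bak, and c_ovl are linear combinations of PESQ, the trimmed LLR and WSS means, and (for c_bak) the mean segmental SNR, each clipped to the 1-5 MOS range. A minimal usage sketch follows; the new module's path is not shown in the diff, so the import location is an assumption, and the synthetic signals merely stand in for real speech at the SonusAI rate:

    import numpy as np

    from sonusai.metrics.calc_speech import calc_speech  # assumed location of the new file

    # Placeholder inputs: a "clean" reference and a lightly perturbed estimate of it.
    # Real usage would load matched-length speech audio; mismatched lengths raise ValueError.
    rng = np.random.default_rng(0)
    reference = rng.normal(size=16000).astype(np.float32)
    hypothesis = reference + 0.01 * rng.normal(size=16000).astype(np.float32)

    metrics = calc_speech(hypothesis=hypothesis, reference=reference)

    # SpeechMetrics is a NamedTuple, so fields are available by name or position
    print(metrics.pesq, metrics.c_sig, metrics.c_bak, metrics.c_ovl)
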
@@ -1,8 +1,7 @@
- from dataclasses import dataclass
+ from typing import NamedTuple


- @dataclass(frozen=True)
- class WerResult:
+ class WerResult(NamedTuple):
      wer: float
      words: int
      substitutions: float
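
Switching WerResult from a frozen dataclass to a NamedTuple keeps it immutable while also making it a real tuple, so results can be unpacked, indexed, and iterated. A small sketch of the behavioral difference, abbreviated to the fields visible in this hunk and with made-up values:

    from typing import NamedTuple


    class WerResult(NamedTuple):
        wer: float
        words: int
        substitutions: float


    result = WerResult(wer=0.12, words=250, substitutions=18.0)

    print(result.wer)                    # attribute access, as with the frozen dataclass
    wer, words, substitutions = result   # tuple unpacking, new with NamedTuple
    print(result[0] == wer)              # indexing also works now
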
@@ -52,62 +52,3 @@ def calc_wsdr(hypothesis: np.ndarray,
      wsdr = 10 * np.log10(-1 / (wsdr - 1 - 1e-7))  # range -3 --> inf (or 1e-7 limit of 70db)

      return float(wsdr), cc, cw
-
- # From calc_sa_sdr:
- # These should include a noise to be a complete mixture estimate, i.e.,
- # noise_est = sum-over-all-srcs(s_est(0:nsamples, :) - sum-over-non-noisesrc(s_est(0:nsamples, n))
- # should be one of the sources in reference (s_true) and hypothesis (s_est).
- #
- # Calculates -10*log10(sumn(||sn||^2) / sumn(||sn - shn||^2)
- # Note: for SA method, sums are done independently on ref and error before division, vs. SDR and SI-SDR
- # where sum over n is taken after divide (before log). This is more stable in noise-only cases and also
- # when some sources are poorly estimated.
- # TBD: add soft-max option with eps and tau params
- #
- # if with_scale:
- #     # calc 1 x nsrc scaling factors
- #     ref_energy = np.sum(reference ** 2, axis=0, keepdims=True)
- #     # if ref_energy is zero, just set scaling to 1.0
- #     with np.errstate(divide='ignore', invalid='ignore'):
- #         opt_scale = np.sum(reference * hypothesis, axis=0, keepdims=True) / ref_energy
- #     opt_scale[opt_scale == np.inf] = 1.0
- #     opt_scale = np.nan_to_num(opt_scale, nan=1.0)
- #     scaled_ref = opt_scale * reference
- # else:
- #     scaled_ref = reference
- #     opt_scale = np.ones((1, reference.shape[1]), dtype=float)
- #
- # # Calculate Lsdr = -<y,ŷ>/(||y|| ||ŷ||), always in range [1 --> -1], size [batch,]
- # t_tru_sq = torch.sum(torch.square(t_tru), -1)
- # t_denom = torch.sqrt(t_tru_sq) * torch.sqrt(torch.sum(torch.square(t_est), -1)) + 1e-7
- # t_wsdr = -torch.divide(torch.sum(torch.multiply(t_tru, t_est), -1), t_denom)
- # n_tru_sq = torch.sum(torch.square(n_tru), -1)
- # n_denom = torch.sqrt(torch.sum(torch.square(n_tru), -1)) \
- #           * torch.sqrt(torch.sum(torch.square(n_est), -1)) + 1e-7
- # n_wsdr = -torch.divide(torch.sum(torch.multiply(n_tru, n_est), -1), n_denom)
- # if self.cl_noise_wght > 0:
- #     wsdr = self.cl_target_wght * t_wsdr + self.cl_noise_wght * n_wsdr
- # else:  # adaptive per relative strength of target vs noise: alpha = ||y||^2 / (||y||^2 + ||z||^2)
- #     tweight = torch.divide(t_tru_sq, t_tru_sq + n_tru_sq + 1e-7)  # energy ratio target vs. noise
- #     wsdr = tweight * t_wsdr + (1 - tweight) * n_wsdr
- # wsdr = torch.mean(wsdr)  # reduction to scalar
- #
- # # multisrc sa-sdr, inputs must be [samples, nsrc]
- # err = scaled_ref - hypothesis
- #
- # # -10*log10(sumk(||sk||^2) / sumk(||sk - shk||^2)
- # # sum over samples and sources
- # num = np.sum(reference ** 2)
- # den = np.sum(err ** 2)
- # if num == 0 and den == 0:
- #     ratio = np.inf
- # else:
- #     ratio = num / (den + np.finfo(np.float32).eps)
- #
- # sa_sdr = 10 * np.log10(ratio)
- #
- # if with_negate:
- #     # for use as a loss function
- #     sa_sdr = -sa_sdr
- #
- # return sa_sdr, opt_scale
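
For reference, the mapping on the surviving line above takes wsdr, a weighted negative cosine similarity in [-1, 1], to a dB-like scale: wsdr - 1 - 1e-7 lies in [-2 - 1e-7, -1e-7], so -1 / (wsdr - 1 - 1e-7) lies in roughly [0.5, 1e7], and 10 * log10 of that spans about -3 dB up to the 70 dB ceiling set by the 1e-7 guard. A quick endpoint check:

    import numpy as np

    def wsdr_to_db(wsdr: float) -> float:
        # Same mapping as the final line of calc_wsdr
        return 10 * np.log10(-1 / (wsdr - 1 - 1e-7))

    print(wsdr_to_db(-1.0))  # worst case: about -3.01 dB
    print(wsdr_to_db(1.0))   # best case: 70.0 dB, limited by the 1e-7 guard
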
@@ -44,6 +44,7 @@ from .constants import VALID_CONFIGS
  from .constants import VALID_NOISE_MIX_MODES
  from .constants import VALID_TRUTH_SETTINGS
  from .datatypes import AudioF
+ from .datatypes import AudioStatsMetrics
  from .datatypes import AudioT
  from .datatypes import AudiosF
  from .datatypes import AudiosT
@@ -72,9 +73,11 @@ from .datatypes import NoiseFile
  from .datatypes import NoiseFiles
  from .datatypes import Predict
  from .datatypes import Segsnr
+ from .datatypes import SnrFMetrics
  from .datatypes import SpectralMask
  from .datatypes import SpectralMasks
  from .datatypes import SpeechMetadata
+ from .datatypes import SpeechMetrics
  from .datatypes import TargetFile
  from .datatypes import TargetFiles
  from .datatypes import TransformConfig
@@ -113,8 +116,6 @@ from .helpers import read_mixture_data
  from .helpers import write_mixture_data
  from .helpers import write_mixture_metadata
  from .log_duration_and_sizes import log_duration_and_sizes
- from .mapped_snr_f import calculate_mapped_snr_f
- from .mapped_snr_f import calculate_snr_f_statistics
  from .mixdb import MixtureDatabase
  from .mixdb import db_file
  from .sox_audio import Transformer
sonusai/mixture/audio.py CHANGED
@@ -1,4 +1,5 @@
  from functools import lru_cache
+ from pathlib import Path

  from sonusai.mixture.datatypes import AudioT
  from sonusai.mixture.datatypes import ImpulseResponseData
@@ -28,7 +29,7 @@ def get_duration(audio: AudioT) -> float:
      return len(audio) / SAMPLE_RATE


- def validate_input_file(input_filepath: str) -> None:
+ def validate_input_file(input_filepath: str | Path) -> None:
      from os.path import exists
      from os.path import splitext

@@ -46,7 +47,7 @@ def validate_input_file(input_filepath: str) -> None:


  @lru_cache
- def get_sample_rate(name: str) -> int:
+ def get_sample_rate(name: str | Path) -> int:
      """Get sample rate from audio file

      :param name: File name
@@ -58,7 +59,7 @@ def get_sample_rate(name: str) -> int:


  @lru_cache
- def read_audio(name: str) -> AudioT:
+ def read_audio(name: str | Path) -> AudioT:
      """Read audio data from a file

      :param name: File name
@@ -70,7 +71,7 @@ def read_audio(name: str) -> AudioT:


  @lru_cache
- def read_ir(name: str) -> ImpulseResponseData:
+ def read_ir(name: str | Path) -> ImpulseResponseData:
      """Read impulse response data

      :param name: File name
@@ -82,7 +83,7 @@ def read_ir(name: str) -> ImpulseResponseData:


  @lru_cache
- def get_num_samples(name: str) -> int:
+ def get_num_samples(name: str | Path) -> int:
      """Get the number of samples resampled to the SonusAI sample rate in the given file

      :param name: File name
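
Widening these signatures to str | Path works cleanly under @lru_cache because pathlib.Path is hashable; note, however, that a str and a Path naming the same file are unequal as cache keys, so they occupy separate entries. A standalone sketch of that caching behavior (not sonusai code; the file name is hypothetical):

    from functools import lru_cache
    from pathlib import Path


    @lru_cache
    def describe(name: str | Path) -> str:
        print(f'cache miss for {name!r}')
        return str(name)


    describe('audio.wav')        # miss
    describe('audio.wav')        # hit: same str key
    describe(Path('audio.wav'))  # miss: Path('audio.wav') != 'audio.wav' as a key
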
sonusai/mixture/config.py CHANGED
@@ -90,6 +90,19 @@ def update_config_from_file(name: str, config: dict) -> dict:

      updated_config['truth_settings'] = update_truth_settings(updated_config['truth_settings'], default)

+     # Handle 'asr_configs' special case
+     if 'asr_configs' in updated_config:
+         asr_configs = {}
+         for asr_config in updated_config['asr_configs']:
+             asr_name = asr_config.get('name', None)
+             asr_engine = asr_config.get('engine', None)
+             if asr_name is None or asr_engine is None:
+                 raise SonusAIError(f'Invalid config parameter in {name}: asr_configs.\n'
+                                    f'asr_configs must contain both name and engine.')
+             del asr_config['name']
+             asr_configs[asr_name] = asr_config
+         updated_config['asr_configs'] = asr_configs
+
      # Check for required keys
      for key in REQUIRED_CONFIGS:
          if key not in updated_config:
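
The effect of this special case is to reshape asr_configs from the list-of-dicts form written in a config file into a dict keyed by name, with the name key removed from each entry. A sketch with hypothetical names and engines:

    # As parsed from a config file: a list of dicts, each carrying 'name' and 'engine'
    updated_config = {
        'asr_configs': [
            {'name': 'asr1', 'engine': 'engine_a', 'model': 'tiny'},
            {'name': 'asr2', 'engine': 'engine_b'},
        ]
    }

    # The same reshaping the loop above performs
    asr_configs = {}
    for asr_config in updated_config['asr_configs']:
        asr_configs[asr_config.pop('name')] = asr_config

    print(asr_configs)
    # {'asr1': {'engine': 'engine_a', 'model': 'tiny'}, 'asr2': {'engine': 'engine_b'}}
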
@@ -4,6 +4,7 @@ from importlib.resources import as_file
  from importlib.resources import files

  REQUIRED_CONFIGS = [
+     'asr_configs',
      'class_balancing',
      'class_balancing_augmentation',
      'class_labels',
@@ -1,4 +1,6 @@
  from dataclasses import dataclass
+ from typing import Any
+ from typing import NamedTuple
  from typing import Optional
  from typing import TypeAlias

@@ -309,8 +311,12 @@ class FeatureGeneratorInfo:
      it_config: TransformConfig


+ ASRConfigs: TypeAlias = dict[str, dict[str, Any]]
+
+
  @dataclass
  class MixtureDatabaseConfig(DataClassSonusAIMixin):
+     asr_configs: Optional[ASRConfigs] = None
      class_balancing: Optional[bool] = False
      class_labels: Optional[list[str]] = None
      class_weights_threshold: Optional[list[float]] = None
@@ -327,3 +333,30 @@ class MixtureDatabaseConfig(DataClassSonusAIMixin):


  SpeechMetadata: TypeAlias = str | list[Interval] | None
+
+
+ class SnrFMetrics(NamedTuple):
+     mean: Optional[float] = None
+     var: Optional[float] = None
+     db_mean: Optional[float] = None
+     db_std: Optional[float] = None
+
+
+ class SpeechMetrics(NamedTuple):
+     pesq: Optional[float] = None
+     c_sig: Optional[float] = None
+     c_bak: Optional[float] = None
+     c_ovl: Optional[float] = None
+
+
+ class AudioStatsMetrics(NamedTuple):
+     dco: Optional[float] = None
+     min: Optional[float] = None
+     max: Optional[float] = None
+     pkdb: Optional[float] = None
+     lrms: Optional[float] = None
+     pkr: Optional[float] = None
+     tr: Optional[float] = None
+     cr: Optional[float] = None
+     fl: Optional[float] = None
+     pkc: Optional[float] = None
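
Because every field of these NamedTuples defaults to None, a caller can construct one before any metric is computed and fill in only what a given code path produces; _replace gives an updated copy, since the tuples are immutable. For example, with SpeechMetrics as defined above:

    from typing import NamedTuple, Optional


    class SpeechMetrics(NamedTuple):
        pesq: Optional[float] = None
        c_sig: Optional[float] = None
        c_bak: Optional[float] = None
        c_ovl: Optional[float] = None


    empty = SpeechMetrics()             # all fields default to None
    partial = SpeechMetrics(pesq=3.2)   # fill only what was computed
    print(partial.c_sig is None)        # True
    print(partial._replace(c_sig=4.1))  # immutable: _replace returns a new tuple
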
@@ -59,6 +59,7 @@ def initialize_db(location: str, test: bool = False) -> None:
      CREATE TABLE top (
          id INTEGER PRIMARY KEY NOT NULL,
          version INTEGER NOT NULL,
+         asr_configs TEXT NOT NULL,
          class_balancing BOOLEAN NOT NULL,
          feature TEXT NOT NULL,
          noise_mix_mode TEXT NOT NULL,
@@ -149,6 +150,8 @@ def initialize_db(location: str, test: bool = False) -> None:
  def populate_top_table(location: str, config: dict, test: bool = False) -> None:
      """Populate top table
      """
+     import json
+
      from sonusai import SonusAIError
      from .mixdb import db_connection

@@ -158,11 +161,12 @@ def populate_top_table(location: str, config: dict, test: bool = False) -> None:

      con = db_connection(location=location, readonly=False, test=test)
      con.execute("""
-         INSERT INTO top (version, class_balancing, feature, noise_mix_mode, num_classes,
+         INSERT INTO top (version, asr_configs, class_balancing, feature, noise_mix_mode, num_classes,
              seed, truth_mutex, truth_reduction_function, mixid_width, speaker_metadata_tiers, textgrid_metadata_tiers)
-         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
          """, (
          1,
+         json.dumps(config['asr_configs']),
          config['class_balancing'],
          config['feature'],
          config['noise_mix_mode'],
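
The new asr_configs column is declared TEXT, so the dict is serialized with json.dumps on insert; a reader would be expected to json.loads the value back into the ASRConfigs mapping. A minimal round-trip sketch against an in-memory database (hypothetical values, reduced schema):

    import json
    import sqlite3

    con = sqlite3.connect(':memory:')
    con.execute('CREATE TABLE top (id INTEGER PRIMARY KEY NOT NULL, asr_configs TEXT NOT NULL)')

    asr_configs = {'asr1': {'engine': 'engine_a', 'model': 'tiny'}}
    con.execute('INSERT INTO top (asr_configs) VALUES (?)', (json.dumps(asr_configs),))

    row = con.execute('SELECT asr_configs FROM top').fetchone()
    assert json.loads(row[0]) == asr_configs  # the TEXT column round-trips the dict
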