sonusai-1.0.16-cp311-abi3-macosx_10_12_x86_64.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (150)
  1. sonusai/__init__.py +170 -0
  2. sonusai/aawscd_probwrite.py +148 -0
  3. sonusai/audiofe.py +481 -0
  4. sonusai/calc_metric_spenh.py +1136 -0
  5. sonusai/config/__init__.py +0 -0
  6. sonusai/config/asr.py +21 -0
  7. sonusai/config/config.py +65 -0
  8. sonusai/config/config.yml +49 -0
  9. sonusai/config/constants.py +53 -0
  10. sonusai/config/ir.py +124 -0
  11. sonusai/config/ir_delay.py +62 -0
  12. sonusai/config/source.py +275 -0
  13. sonusai/config/spectral_masks.py +15 -0
  14. sonusai/config/truth.py +64 -0
  15. sonusai/constants.py +14 -0
  16. sonusai/data/__init__.py +0 -0
  17. sonusai/data/silero_vad_v5.1.jit +0 -0
  18. sonusai/data/silero_vad_v5.1.onnx +0 -0
  19. sonusai/data/speech_ma01_01.wav +0 -0
  20. sonusai/data/whitenoise.wav +0 -0
  21. sonusai/datatypes.py +383 -0
  22. sonusai/deprecated/gentcst.py +632 -0
  23. sonusai/deprecated/plot.py +519 -0
  24. sonusai/deprecated/tplot.py +365 -0
  25. sonusai/doc.py +52 -0
  26. sonusai/doc_strings/__init__.py +1 -0
  27. sonusai/doc_strings/doc_strings.py +531 -0
  28. sonusai/genft.py +196 -0
  29. sonusai/genmetrics.py +183 -0
  30. sonusai/genmix.py +199 -0
  31. sonusai/genmixdb.py +235 -0
  32. sonusai/ir_metric.py +551 -0
  33. sonusai/lsdb.py +141 -0
  34. sonusai/main.py +134 -0
  35. sonusai/metrics/__init__.py +43 -0
  36. sonusai/metrics/calc_audio_stats.py +42 -0
  37. sonusai/metrics/calc_class_weights.py +90 -0
  38. sonusai/metrics/calc_optimal_thresholds.py +73 -0
  39. sonusai/metrics/calc_pcm.py +45 -0
  40. sonusai/metrics/calc_pesq.py +36 -0
  41. sonusai/metrics/calc_phase_distance.py +43 -0
  42. sonusai/metrics/calc_sa_sdr.py +64 -0
  43. sonusai/metrics/calc_sample_weights.py +25 -0
  44. sonusai/metrics/calc_segsnr_f.py +82 -0
  45. sonusai/metrics/calc_speech.py +382 -0
  46. sonusai/metrics/calc_wer.py +71 -0
  47. sonusai/metrics/calc_wsdr.py +57 -0
  48. sonusai/metrics/calculate_metrics.py +395 -0
  49. sonusai/metrics/class_summary.py +74 -0
  50. sonusai/metrics/confusion_matrix_summary.py +75 -0
  51. sonusai/metrics/one_hot.py +283 -0
  52. sonusai/metrics/snr_summary.py +128 -0
  53. sonusai/metrics_summary.py +314 -0
  54. sonusai/mixture/__init__.py +15 -0
  55. sonusai/mixture/audio.py +187 -0
  56. sonusai/mixture/class_balancing.py +103 -0
  57. sonusai/mixture/constants.py +3 -0
  58. sonusai/mixture/data_io.py +173 -0
  59. sonusai/mixture/db.py +169 -0
  60. sonusai/mixture/db_datatypes.py +92 -0
  61. sonusai/mixture/effects.py +344 -0
  62. sonusai/mixture/feature.py +78 -0
  63. sonusai/mixture/generation.py +1116 -0
  64. sonusai/mixture/helpers.py +351 -0
  65. sonusai/mixture/ir_effects.py +77 -0
  66. sonusai/mixture/log_duration_and_sizes.py +23 -0
  67. sonusai/mixture/mixdb.py +1857 -0
  68. sonusai/mixture/pad_audio.py +35 -0
  69. sonusai/mixture/resample.py +7 -0
  70. sonusai/mixture/sox_effects.py +195 -0
  71. sonusai/mixture/sox_help.py +650 -0
  72. sonusai/mixture/spectral_mask.py +51 -0
  73. sonusai/mixture/truth.py +61 -0
  74. sonusai/mixture/truth_functions/__init__.py +45 -0
  75. sonusai/mixture/truth_functions/crm.py +105 -0
  76. sonusai/mixture/truth_functions/energy.py +222 -0
  77. sonusai/mixture/truth_functions/file.py +48 -0
  78. sonusai/mixture/truth_functions/metadata.py +24 -0
  79. sonusai/mixture/truth_functions/metrics.py +28 -0
  80. sonusai/mixture/truth_functions/phoneme.py +18 -0
  81. sonusai/mixture/truth_functions/sed.py +98 -0
  82. sonusai/mixture/truth_functions/target.py +142 -0
  83. sonusai/mkwav.py +135 -0
  84. sonusai/onnx_predict.py +363 -0
  85. sonusai/parse/__init__.py +0 -0
  86. sonusai/parse/expand.py +156 -0
  87. sonusai/parse/parse_source_directive.py +129 -0
  88. sonusai/parse/rand.py +214 -0
  89. sonusai/py.typed +0 -0
  90. sonusai/queries/__init__.py +0 -0
  91. sonusai/queries/queries.py +239 -0
  92. sonusai/rs.abi3.so +0 -0
  93. sonusai/rs.pyi +1 -0
  94. sonusai/rust/__init__.py +0 -0
  95. sonusai/speech/__init__.py +0 -0
  96. sonusai/speech/l2arctic.py +121 -0
  97. sonusai/speech/librispeech.py +102 -0
  98. sonusai/speech/mcgill.py +71 -0
  99. sonusai/speech/textgrid.py +89 -0
  100. sonusai/speech/timit.py +138 -0
  101. sonusai/speech/types.py +12 -0
  102. sonusai/speech/vctk.py +53 -0
  103. sonusai/speech/voxceleb.py +108 -0
  104. sonusai/utils/__init__.py +3 -0
  105. sonusai/utils/asl_p56.py +130 -0
  106. sonusai/utils/asr.py +91 -0
  107. sonusai/utils/asr_functions/__init__.py +3 -0
  108. sonusai/utils/asr_functions/aaware_whisper.py +69 -0
  109. sonusai/utils/audio_devices.py +50 -0
  110. sonusai/utils/braced_glob.py +50 -0
  111. sonusai/utils/calculate_input_shape.py +26 -0
  112. sonusai/utils/choice.py +51 -0
  113. sonusai/utils/compress.py +25 -0
  114. sonusai/utils/convert_string_to_number.py +6 -0
  115. sonusai/utils/create_timestamp.py +5 -0
  116. sonusai/utils/create_ts_name.py +14 -0
  117. sonusai/utils/dataclass_from_dict.py +27 -0
  118. sonusai/utils/db.py +16 -0
  119. sonusai/utils/docstring.py +53 -0
  120. sonusai/utils/energy_f.py +44 -0
  121. sonusai/utils/engineering_number.py +166 -0
  122. sonusai/utils/evaluate_random_rule.py +15 -0
  123. sonusai/utils/get_frames_per_batch.py +2 -0
  124. sonusai/utils/get_label_names.py +20 -0
  125. sonusai/utils/grouper.py +6 -0
  126. sonusai/utils/human_readable_size.py +7 -0
  127. sonusai/utils/keyboard_interrupt.py +12 -0
  128. sonusai/utils/load_object.py +21 -0
  129. sonusai/utils/max_text_width.py +9 -0
  130. sonusai/utils/model_utils.py +28 -0
  131. sonusai/utils/numeric_conversion.py +11 -0
  132. sonusai/utils/onnx_utils.py +155 -0
  133. sonusai/utils/parallel.py +162 -0
  134. sonusai/utils/path_info.py +7 -0
  135. sonusai/utils/print_mixture_details.py +60 -0
  136. sonusai/utils/rand.py +13 -0
  137. sonusai/utils/ranges.py +43 -0
  138. sonusai/utils/read_predict_data.py +32 -0
  139. sonusai/utils/reshape.py +154 -0
  140. sonusai/utils/seconds_to_hms.py +7 -0
  141. sonusai/utils/stacked_complex.py +82 -0
  142. sonusai/utils/stratified_shuffle_split.py +170 -0
  143. sonusai/utils/tokenized_shell_vars.py +143 -0
  144. sonusai/utils/write_audio.py +26 -0
  145. sonusai/utils/yes_or_no.py +8 -0
  146. sonusai/vars.py +47 -0
  147. sonusai-1.0.16.dist-info/METADATA +56 -0
  148. sonusai-1.0.16.dist-info/RECORD +150 -0
  149. sonusai-1.0.16.dist-info/WHEEL +4 -0
  150. sonusai-1.0.16.dist-info/entry_points.txt +3 -0
sonusai/metrics/calc_speech.py
@@ -0,0 +1,382 @@
+ import numpy as np
+
+ from ..constants import SAMPLE_RATE
+ from ..datatypes import SpeechMetrics
+ from .calc_pesq import calc_pesq
+
+
+ def calc_speech(
+     hypothesis: np.ndarray,
+     reference: np.ndarray,
+     pesq: float | None = None,
+     sample_rate: int = SAMPLE_RATE,
+ ) -> SpeechMetrics:
+     """Calculate speech metrics c_sig, c_bak, and c_ovl.
+
+     These are all related and thus included in one function. Reference: MATLAB script "compute_metrics.m".
+
+     :param hypothesis: estimated audio
+     :param reference: reference audio
+     :param pesq: optional precomputed PESQ score; computed from the signals when None
+     :param sample_rate: sample rate of audio
+     :return: SpeechMetrics named tuple
+     """
+
+     # Weighted spectral slope measure
+     wss_dist_vec = _calc_weighted_spectral_slope_measure(hypothesis=hypothesis, reference=reference)
+     wss_dist_vec = np.sort(wss_dist_vec)
+
+     # Value from CMGAN reference implementation
+     alpha = 0.95
+     wss_dist = np.mean(wss_dist_vec[0 : round(np.size(wss_dist_vec) * alpha)])
+
+     # Log likelihood ratio measure
+     llr_dist = _calc_log_likelihood_ratio_measure(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
+     ll_rs = np.sort(llr_dist)
+     llr_len = round(np.size(llr_dist) * alpha)
+     llr_mean = np.mean(ll_rs[:llr_len])
+
+     # Segmental SNR
+     _, segsnr_dist = _calc_snr(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
+     seg_snr = np.mean(segsnr_dist)
+
+     # PESQ
+     if pesq is None:
+         pesq = calc_pesq(hypothesis=hypothesis, reference=reference, sample_rate=sample_rate)
+
+     # Now compute the composite measures
+     csig = float(np.clip(3.093 - 1.029 * llr_mean + 0.603 * pesq - 0.009 * wss_dist, 1, 5))
+     cbak = float(np.clip(1.634 + 0.478 * pesq - 0.007 * wss_dist + 0.063 * seg_snr, 1, 5))
+     covl = float(np.clip(1.594 + 0.805 * pesq - 0.512 * llr_mean - 0.007 * wss_dist, 1, 5))
+
+     return SpeechMetrics(csig, cbak, covl)
+
+
+ def _calc_weighted_spectral_slope_measure(
+     hypothesis: np.ndarray,
+     reference: np.ndarray,
+     sample_rate: int = SAMPLE_RATE,
+ ) -> np.ndarray:
+     from scipy.fftpack import fft
+
+     # The lengths of the reference and hypothesis must be the same.
+     reference_length = np.size(reference)
+     hypothesis_length = np.size(hypothesis)
+     if reference_length != hypothesis_length:
+         raise ValueError("Hypothesis and reference must be the same length.")
+
+     # Window length in samples
+     win_length = int(np.round(30 * sample_rate / 1000))
+     # Window skip in samples
+     skip_rate = int(np.floor(np.divide(win_length, 4)))
+     # Maximum bandwidth
+     max_freq = int(np.divide(sample_rate, 2))
+     num_crit = 25
+
+     n_fft = int(np.power(2, np.ceil(np.log2(2 * win_length))))
+     n_fft_by_2 = int(np.multiply(0.5, n_fft))
+     # Value suggested by Klatt, pg 1280
+     k_max = 20.0
+     # Value suggested by Klatt, pg 1280
+     k_loc_max = 1.0
+
+     # Critical band filter definitions (center frequency and bandwidths in Hz)
+     cent_freq = np.array(
+         [
+             50.0000,
+             120.000,
+             190.000,
+             260.000,
+             330.000,
+             400.000,
+             470.000,
+             540.000,
+             617.372,
+             703.378,
+             798.717,
+             904.128,
+             1020.38,
+             1148.30,
+             1288.72,
+             1442.54,
+             1610.70,
+             1794.16,
+             1993.93,
+             2211.08,
+             2446.71,
+             2701.97,
+             2978.04,
+             3276.17,
+             3597.63,
+         ]
+     )
+     bandwidth = np.array(
+         [
+             70.0000,
+             70.0000,
+             70.0000,
+             70.0000,
+             70.0000,
+             70.0000,
+             70.0000,
+             77.3724,
+             86.0056,
+             95.3398,
+             105.411,
+             116.256,
+             127.914,
+             140.423,
+             153.823,
+             168.154,
+             183.457,
+             199.776,
+             217.153,
+             235.631,
+             255.255,
+             276.072,
+             298.126,
+             321.465,
+             346.136,
+         ]
+     )
+
+     # Minimum critical bandwidth
+     bw_min = bandwidth[0]
+
+     # Set up the critical band filters.
+     # Note here that Gaussian-ly shaped filters are used.
+     # Also, the sum of the filter weights are equivalent for each critical band filter.
+     # Filter less than -30 dB and set to zero.
+
+     # -30 dB point of filter
+     min_factor = np.exp(-30.0 / (2.0 * 2.303))
+     crit_filter = np.empty((num_crit, n_fft_by_2))
+     for i in range(num_crit):
+         f0 = (cent_freq[i] / max_freq) * n_fft_by_2
+         bw = (bandwidth[i] / max_freq) * n_fft_by_2
+         norm_factor = np.log(bw_min) - np.log(bandwidth[i])
+         j = np.arange(n_fft_by_2)
+         crit_filter[i, :] = np.exp(-11 * np.square(np.divide(j - np.floor(f0), bw)) + norm_factor)
+         cond = np.greater(crit_filter[i, :], min_factor)
+         crit_filter[i, :] = np.where(cond, crit_filter[i, :], 0)
+
+     # For each frame of input speech, calculate the weighted spectral slope measure
+     num_frames = int(reference_length / skip_rate - (win_length / skip_rate))
+     start = 0
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
+
+     distortion = np.empty(num_frames)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         reference_frame = reference[start : start + win_length] / 32768
+         hypothesis_frame = hypothesis[start : start + win_length] / 32768
+         reference_frame = np.multiply(reference_frame, window)
+         hypothesis_frame = np.multiply(hypothesis_frame, window)
+
+         # (2) Compute the power spectrum of reference and hypothesis
+         reference_spec = np.square(np.abs(fft(reference_frame, n_fft)))
+         hypothesis_spec = np.square(np.abs(fft(hypothesis_frame, n_fft)))
+
+         # (3) Compute filter bank output energies (in dB scale)
+         reference_energy = np.matmul(crit_filter, reference_spec[0:n_fft_by_2])
+         hypothesis_energy = np.matmul(crit_filter, hypothesis_spec[0:n_fft_by_2])
+
+         reference_energy = 10 * np.log10(np.maximum(reference_energy, 1e-10))
+         hypothesis_energy = 10 * np.log10(np.maximum(hypothesis_energy, 1e-10))
+
+         # (4) Compute spectral slope (dB[i+1]-dB[i])
+         reference_slope = reference_energy[1:num_crit] - reference_energy[0 : num_crit - 1]
+         hypothesis_slope = hypothesis_energy[1:num_crit] - hypothesis_energy[0 : num_crit - 1]
+
+         # (5) Find the nearest peak locations in the spectra to each critical band.
+         # If the slope is negative, we search to the left. If positive, we search to the right.
+         reference_loc_peak = np.empty(num_crit - 1)
+         hypothesis_loc_peak = np.empty(num_crit - 1)
+
+         for i in range(num_crit - 1):
+             # find the peaks in the reference speech signal
+             if reference_slope[i] > 0:
+                 # search to the right
+                 n = i
+                 while (n < num_crit - 1) and (reference_slope[n] > 0):
+                     n = n + 1
+                 reference_loc_peak[i] = reference_energy[n - 1]
+             else:
+                 # search to the left
+                 n = i
+                 while (n >= 0) and (reference_slope[n] <= 0):
+                     n = n - 1
+                 reference_loc_peak[i] = reference_energy[n + 1]
+
+             # find the peaks in the hypothesis speech signal
+             if hypothesis_slope[i] > 0:
+                 # search to the right
+                 n = i
+                 while (n < num_crit - 1) and (hypothesis_slope[n] > 0):
+                     n = n + 1
+                 hypothesis_loc_peak[i] = hypothesis_energy[n - 1]
+             else:
+                 # search to the left
+                 n = i
+                 while (n >= 0) and (hypothesis_slope[n] <= 0):
+                     n = n - 1
+                 hypothesis_loc_peak[i] = hypothesis_energy[n + 1]
+
+         # (6) Compute the weighted spectral slope measure for this frame.
+         # This includes determination of the weighting function.
+         db_max_reference = np.max(reference_energy)
+         db_max_hypothesis = np.max(hypothesis_energy)
+
+         # The weights are calculated by averaging individual weighting factors from the reference and hypothesis frame.
+         # These weights w_reference and w_hypothesis should range from 0 to 1 and place more emphasis on spectral peaks
+         # and less emphasis on slope differences in spectral valleys.
+         # This procedure is described on page 1280 of Klatt's 1982 ICASSP paper.
+
+         w_max_reference = np.divide(k_max, k_max + db_max_reference - reference_energy[0 : num_crit - 1])
+         w_loc_max_reference = np.divide(
+             k_loc_max,
+             k_loc_max + reference_loc_peak - reference_energy[0 : num_crit - 1],
+         )
+         w_reference = np.multiply(w_max_reference, w_loc_max_reference)
+
+         w_max_hypothesis = np.divide(k_max, k_max + db_max_hypothesis - hypothesis_energy[0 : num_crit - 1])
+         w_loc_max_hypothesis = np.divide(
+             k_loc_max,
+             k_loc_max + hypothesis_loc_peak - hypothesis_energy[0 : num_crit - 1],
+         )
+         w_hypothesis = np.multiply(w_max_hypothesis, w_loc_max_hypothesis)
+
+         w = np.divide(np.add(w_reference, w_hypothesis), 2.0)
+         slope_diff = np.subtract(reference_slope, hypothesis_slope)[0 : num_crit - 1]
+         distortion[frame_count] = np.dot(w, np.square(slope_diff)) / np.sum(w)
+
+         # This normalization is not part of Klatt's paper, but helps to normalize the measure.
+         # Here we scale the measure by the sum of the weights.
+         start = start + skip_rate
+
+     return distortion
+
+
+ def _calc_log_likelihood_ratio_measure(
+     hypothesis: np.ndarray,
+     reference: np.ndarray,
+     sample_rate: int = SAMPLE_RATE,
+ ) -> np.ndarray:
+     from scipy.linalg import toeplitz
+
+     # The lengths of the reference and hypothesis must be the same.
+     reference_length = np.size(reference)
+     hypothesis_length = np.size(hypothesis)
+     if reference_length != hypothesis_length:
+         raise ValueError("Hypothesis and reference must be the same length.")
+
+     # window length in samples
+     win_length = int(np.round(30 * sample_rate / 1000))
+     # window skip in samples
+     skip_rate = int(np.floor(win_length / 4))
+     # LPC analysis order; this could vary depending on sampling frequency.
+     if sample_rate < 10000:
+         p = 10
+     else:
+         p = 16
+
+     # For each frame of input speech, calculate the log likelihood ratio
+     num_frames = int((reference_length - win_length) / skip_rate)
+     start = 0
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
+
+     distortion = np.empty(num_frames)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         reference_frame = reference[start : start + win_length]
+         hypothesis_frame = hypothesis[start : start + win_length]
+         reference_frame = np.multiply(reference_frame, window)
+         hypothesis_frame = np.multiply(hypothesis_frame, window)
+
+         # (2) Get the autocorrelation lags and LPC parameters used to compute the log likelihood ratio measure.
+         r_reference, _, a_reference = _lp_coefficients(reference_frame, p)
+         _, _, a_hypothesis = _lp_coefficients(hypothesis_frame, p)
+
+         # (3) Compute the log likelihood ratio measure
+         numerator = np.dot(np.matmul(a_hypothesis, toeplitz(r_reference)), a_hypothesis)
+         denominator = np.dot(np.matmul(a_reference, toeplitz(r_reference)), a_reference)
+         distortion[frame_count] = np.log(numerator / denominator)
+         start = start + skip_rate
+     return distortion
+
+
+ def _calc_snr(
+     hypothesis: np.ndarray,
+     reference: np.ndarray,
+     sample_rate: int = SAMPLE_RATE,
+ ) -> tuple[float, np.ndarray]:
+     # The lengths of the reference and hypothesis must be the same.
+     reference_length = len(reference)
+     hypothesis_length = len(hypothesis)
+     if reference_length != hypothesis_length:
+         raise ValueError("Hypothesis and reference must be the same length.")
+
+     overall_snr = 10 * np.log10(
+         np.sum(np.square(reference)) / (np.sum(np.square(reference - hypothesis))) + np.finfo(np.float32).eps
+     )
+
+     # window length in samples
+     win_length = round(30 * sample_rate / 1000)
+     # window skip in samples
+     skip_rate = int(np.floor(win_length / 4))
+     # minimum SNR in dB
+     min_snr = -10
+     # maximum SNR in dB
+     max_snr = 35
+
+     # For each frame of input speech, calculate the segmental SNR
+     num_frames = int(reference_length / skip_rate - (win_length / skip_rate))
+     start = 0
+     window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(1, win_length + 1) / (win_length + 1)))
+
+     segmental_snr = np.empty(num_frames)
+     eps = np.spacing(1)
+     for frame_count in range(num_frames):
+         # (1) Get the frames for the test and reference speech. Multiply by Hanning window.
+         reference_frame = reference[start : start + win_length]
+         hypothesis_frame = hypothesis[start : start + win_length]
+         reference_frame = np.multiply(reference_frame, window)
+         hypothesis_frame = np.multiply(hypothesis_frame, window)
+
+         # (2) Compute the segmental SNR
+         signal_energy = np.sum(np.square(reference_frame))
+         noise_energy = np.sum(np.square(reference_frame - hypothesis_frame))
+         segmental_snr[frame_count] = np.clip(
+             10 * np.log10(signal_energy / (noise_energy + eps) + eps), min_snr, max_snr
+         )
+
+         start = start + skip_rate
+
+     return overall_snr, segmental_snr
+
+
+ def _lp_coefficients(speech_frame, model_order):
+     # (1) Compute autocorrelation lags
+     win_length = np.size(speech_frame)
+     autocorrelation = np.empty(model_order + 1)
+     e = np.empty(model_order + 1)
+     for k in range(model_order + 1):
+         autocorrelation[k] = np.dot(speech_frame[0 : win_length - k], speech_frame[k:win_length])
+
+     # (2) Levinson-Durbin
+     a = np.ones(model_order)
+     a_past = np.empty(model_order)
+     ref_coefficients = np.empty(model_order)
+     e[0] = autocorrelation[0]
+     for i in range(model_order):
+         a_past[0:i] = a[0:i]
+         sum_term = np.dot(a_past[0:i], autocorrelation[i:0:-1])
+         ref_coefficients[i] = (autocorrelation[i + 1] - sum_term) / e[i]
+         a[i] = ref_coefficients[i]
+         if i == 0:
+             a[0:i] = a_past[0:i] - np.multiply(a_past[i - 1 : -1 : -1], ref_coefficients[i])
+         else:
+             a[0:i] = a_past[0:i] - np.multiply(a_past[i - 1 :: -1], ref_coefficients[i])
+         e[i + 1] = (1 - ref_coefficients[i] * ref_coefficients[i]) * e[i]
+     lp_params = np.concatenate((np.array([1]), -a))
+     return autocorrelation, ref_coefficients, lp_params
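
For orientation: the composite measures above follow the Hu and Loizou convention, in which CSIG, CBAK, and COVL are fixed linear combinations of the LLR, WSS, segmental SNR, and PESQ scores, each clipped to the MOS range [1, 5]. A minimal usage sketch, assuming equal-length 16 kHz signals scaled like int16 PCM and a direct module import; the synthetic signal and the precomputed pesq value are illustrative only, not taken from the package:

    import numpy as np

    from sonusai.metrics.calc_speech import calc_speech

    rng = np.random.default_rng(0)
    reference = rng.standard_normal(16000) * 1000.0              # 1 s of synthetic audio at 16 kHz
    hypothesis = reference + rng.standard_normal(16000) * 50.0   # slightly degraded estimate

    # Passing a precomputed PESQ score skips the internal calc_pesq call.
    csig, cbak, covl = calc_speech(hypothesis=hypothesis, reference=reference, pesq=2.5)
    print(f"CSIG={csig:.2f} CBAK={cbak:.2f} COVL={covl:.2f}")
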
sonusai/metrics/calc_wer.py
@@ -0,0 +1,71 @@
+ from typing import NamedTuple
+
+
+ class WerResult(NamedTuple):
+     wer: float
+     words: int
+     substitutions: float
+     deletions: float
+     insertions: float
+
+
+ def calc_wer(hypothesis: list[str] | str, reference: list[str] | str) -> WerResult:
+     """Computes the average word error rate between two texts represented as corresponding strings or lists of strings.
+
+     :param hypothesis: the hypothesis sentence(s) as a string or list of strings
+     :param reference: the reference sentence(s) as a string or list of strings
+     :return: a WerResult named tuple with wer, words, substitutions, deletions, and insertions
+     """
+     import jiwer
+
+     transformation = jiwer.Compose(
+         [
+             jiwer.ToLowerCase(),
+             jiwer.RemovePunctuation(),
+             jiwer.RemoveWhiteSpace(replace_by_space=True),
+             jiwer.RemoveMultipleSpaces(),
+             jiwer.Strip(),
+             jiwer.RemoveEmptyStrings(),
+             jiwer.ReduceToListOfListOfWords(word_delimiter=" "),
+         ]
+     )
+
+     if isinstance(reference, str):
+         reference = [reference]
+     if isinstance(hypothesis, str):
+         hypothesis = [hypothesis]
+
+     # jiwer does not allow empty string
+     measures = {"insertions": 0, "substitutions": 0, "deletions": 0, "hits": 0}
+     if any(len(t) == 0 for t in reference):
+         if any(len(t) != 0 for t in hypothesis):
+             measures["insertions"] = len(hypothesis)
+     else:
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis,
+             truth_transform=transformation,
+             hypothesis_transform=transformation,
+         )
+
+     errors = measures["substitutions"] + measures["deletions"] + measures["insertions"]
+     words = measures["hits"] + measures["substitutions"] + measures["deletions"]
+
+     if words != 0:
+         wer = errors / words
+         substitutions_rate = measures["substitutions"] / words
+         deletions_rate = measures["deletions"] / words
+         insertions_rate = measures["insertions"] / words
+     else:
+         wer = float("inf")
+         substitutions_rate = float("inf")
+         deletions_rate = float("inf")
+         insertions_rate = float("inf")
+
+     return WerResult(
+         wer=wer,
+         words=int(words),
+         substitutions=substitutions_rate,
+         deletions=deletions_rate,
+         insertions=insertions_rate,
+     )
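
Note that all four rates are normalized by the same reference word count (hits + substitutions + deletions), so the substitution, deletion, and insertion rates always sum to the overall WER. A quick sketch of the semantics, assuming jiwer is installed; the example sentences are illustrative:

    from sonusai.metrics.calc_wer import calc_wer

    # 6 reference words, 3 of them deleted in the hypothesis:
    result = calc_wer(hypothesis="the cat sat", reference="the cat sat on the mat")
    print(result.wer, result.words, result.deletions)  # 0.5 6 0.5
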
sonusai/metrics/calc_wsdr.py
@@ -0,0 +1,57 @@
+ import numpy as np
+
+
+ def calc_wsdr(
+     hypothesis: np.ndarray,
+     reference: np.ndarray,
+     with_log: bool = False,
+     with_negate: bool = False,
+ ) -> tuple[float, np.ndarray, np.ndarray]:
+     """Calculate weighted SDR (signal-to-distortion ratio) using all source inputs of size [samples, nsrc].
+     Uses true reference energy ratios to weight each cross-correlation coefficient cc = <y,yˆ>/∥y∥∥yˆ∥
+     in a sum over all sources.
+
+     Range is -1 --> 1 as correlation/estimation improves, or with with_log, -3 dB --> 70 dB (1e7 max).
+     If with_negate, range is 1 --> -1 as correlation improves, and with with_log, 3 dB --> -70 dB (1e-7 min).
+
+     Returns: wsdr      scalar weighted signal-to-distortion ratio
+              ccoef     nsrc vector of cross correlation coefficients
+              cweights  nsrc vector of reference energy ratio weights
+
+     Reference:
+         WSDR: 2019-ICLR-dcunet-phase-aware-speech-enh
+
+     :param hypothesis: [samples, nsrc]
+     :param reference: [samples, nsrc]
+     :param with_log: enable scaling (return 10*log10)
+     :param with_negate: enable negation (for use as a loss function)
+     :return: (wsdr, ccoef, cweights)
+     """
+     nsrc = reference.shape[-1]
+     if hypothesis.shape[-1] != nsrc:
+         raise ValueError("hypothesis has wrong shape")
+
+     # Calculate cc = <y,yˆ>/∥y∥∥yˆ∥ always in range -1 --> 1, size [1,nsrc]
+     ref_e = np.sum(reference**2, axis=0, keepdims=True)  # [1,nsrc]
+     hy_e = np.sum(hypothesis**2, axis=0, keepdims=True)
+     allref_e = np.sum(ref_e)
+     cc = np.zeros(nsrc)  # cross correlation coefficients
+     cw = np.zeros(nsrc)  # cc weights (energy ratio)
+     for i in range(nsrc):
+         denom = np.sqrt(ref_e[0, i]) * np.sqrt(hy_e[0, i]) + 1e-7
+         cc[i] = np.sum(reference[:, i] * hypothesis[:, i], axis=0, keepdims=True) / denom
+         cw[i] = ref_e[0, i] / (allref_e + 1e-7)
+
+     # Note: tests show cw sums to 1.0 (+/- 7 digits), so just use cw for the weighted sum
+     if with_negate:  # for use as a loss function
+         wsdr = float(np.sum(cw * -cc))  # cc always in range 1 --> -1
+         if with_log:
+             wsdr = max(wsdr, -1.0)
+             wsdr = 10 * np.log10(wsdr + 1 + 1e-7)  # range 3 --> -inf (or 1e-7 limit of -70 dB)
+     else:
+         wsdr = float(np.sum(cw * cc))  # cc always in range -1 --> 1
+         if with_log:
+             wsdr = min(wsdr, 1.0)  # np.sum(cw * cc) must saturate at 1.0 for the log
+             wsdr = 10 * np.log10(-1 / (wsdr - 1 - 1e-7))  # range -3 --> inf (or 1e-7 limit of 70 dB)
+
+     return float(wsdr), cc, cw
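
Because the weights cw are reference energy ratios that sum to one, a perfect estimate gives wsdr = 1.0, which with_log maps to the 70 dB saturation point noted in the docstring. A minimal sketch with two synthetic sources, assuming a direct module import; the tones and noise level are illustrative:

    import numpy as np

    from sonusai.metrics.calc_wsdr import calc_wsdr

    t = np.arange(16000) / 16000.0
    reference = np.stack([np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 220 * t)], axis=1)  # [samples, nsrc]
    hypothesis = reference + 0.01 * np.random.default_rng(0).standard_normal(reference.shape)

    wsdr, cc, cw = calc_wsdr(hypothesis=hypothesis, reference=reference, with_log=True)
    print(wsdr, cc, cw)  # wsdr in dB; cc near 1.0 per source; cw sums to ~1.0
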