sonusai 0.18.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
sonusai/mixture/mixdb.py CHANGED
@@ -6,6 +6,7 @@ from sqlite3 import Cursor
 from typing import Any
 from typing import Optional

+from sonusai.mixture.datatypes import ASRConfigs
 from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import AudiosF
@@ -88,6 +89,7 @@ class MixtureDatabase:
         from .datatypes import MixtureDatabaseConfig

         config = MixtureDatabaseConfig(
+            asr_configs=self.asr_configs,
             class_balancing=self.class_balancing,
             class_labels=self.class_labels,
             class_weights_threshold=self.class_weights_thresholds,
@@ -145,6 +147,30 @@ class MixtureDatabase:
         with self.db() as c:
             return str(c.execute("SELECT top.noise_mix_mode FROM top").fetchone()[0])

+    @cached_property
+    def asr_configs(self) -> ASRConfigs:
+        import json
+
+        with self.db() as c:
+            return json.loads(c.execute("SELECT top.asr_configs FROM top").fetchone()[0])
+
+    @cached_property
+    def supported_metrics(self) -> set[str]:
+        metrics = {
+            'mxssnravg', 'mxssnrvar', 'mxssnrdavg', 'mxssnrdstd',
+            'mxpesq', 'mxcsig', 'mxcbak', 'mxcovl', 'mxwsdr',
+            'mxpd',
+            'mxstoi',
+            'tdco', 'tmin', 'tmax', 'tpkdb', 'tlrms', 'tpkr', 'ttr', 'tcr', 'tfl', 'tpkc',
+            'ndco', 'nmin', 'nmax', 'npkdb', 'nlrms', 'npkr', 'ntr', 'ncr', 'nfl', 'npkc',
+            'sedavg', 'sedcnt', 'sedtopn',
+            'ssnr',
+        }
+        for name in self.asr_configs:
+            metrics.add(f'mxwer.{name}')
+
+        return metrics
+
     @cached_property
     def class_balancing(self) -> bool:
         with self.db() as c:
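
The new asr_configs property reads the ASR configuration dictionary stored in the database, and supported_metrics extends the fixed set of metric names with one mxwer.<name> entry per configured ASR. A minimal sketch of how a caller might validate requested metric names against this set, assuming MixtureDatabase is importable from sonusai.mixture and can be constructed from a mixture database location; the location and requested names below are placeholders:

    from sonusai.mixture import MixtureDatabase

    mixdb = MixtureDatabase('path/to/mixdb')  # placeholder location
    requested = ['mxpesq', 'mxstoi', 'tdco']
    unknown = [name for name in requested if name not in mixdb.supported_metrics]
    if unknown:
        raise ValueError(f'Unsupported metrics: {unknown}')
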
@@ -1108,173 +1134,286 @@ class MixtureDatabase:

         return mixture_all_speech_metadata(self, self.mixture(m_id))

-    def mixture_metric(self, m_id: int, metric: str, force: bool = False) -> Any:
-        """Get metric data for the given mixture ID
+    def mixture_metrics(self, m_id: int,
+                        metrics: list[str],
+                        force: bool = False) -> list[float | int | Segsnr]:
+        """Get metrics data for the given mixture ID

         :param m_id: Zero-based mixture ID
-        :param metric: Metric data to retrieve
+        :param metrics: List of metrics to get
         :param force: Force computing data from original sources regardless of whether cached data exists
-        :return: Metric data
+        :return: List of metric data
         """
+        from typing import Callable
+
+        import numpy as np
+        from pystoi import stoi
+
         from sonusai import SonusAIError
+        from sonusai.metrics import calc_audio_stats
+        from sonusai.metrics import calc_phase_distance
+        from sonusai.metrics import calc_snr_f
+        from sonusai.metrics import calc_speech
+        from sonusai.metrics import calc_wer
+        from sonusai.metrics import calc_wsdr
+        from sonusai.mixture import SAMPLE_RATE
+        from sonusai.mixture import AudioStatsMetrics
+        from sonusai.mixture import SpeechMetrics
+        from sonusai.utils import calc_asr

-        supported_metrics = (
-            'MXSNR',
-            'MXSSNRAVG',
-            'MXSSNRSTD',
-            'MXSSNRDAVG',
-            'MXSSNRDSTD',
-            'MXPESQ',
-            'MXWSDR',
-            'MXPD',
-            'MXSTOI',
-            'MXCSIG',
-            'MXCBAK',
-            'MXCOVL',
-            'TDCO',
-            'TMIN',
-            'TMAX',
-            'TPKDB',
-            'TLRMS',
-            'TPKR',
-            'TTR',
-            'TCR',
-            'TFL',
-            'TPKC',
-            'NDCO',
-            'NMIN',
-            'NMAX',
-            'NPKDB',
-            'NLRMS',
-            'NPKR',
-            'NTR',
-            'NCR',
-            'NFL',
-            'NPKC',
-            'SEDAVG',
-            'SEDCNT',
-            'SEDTOPN',
-        )
+        def create_target_audio() -> Callable:
+            state = None

-        if not (metric in supported_metrics or metric.startswith('MXWER')):
-            raise SonusAIError(f'Unsupported metric: {metric}')
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_target(m_id)
+                return state

-        if not force:
-            result = self.read_mixture_data(m_id, metric)
-            if result is not None:
-                return result
+            return get

-        mixture = self.mixture(m_id)
-        if mixture is None:
-            raise SonusAIError(f'Could not find mixture for m_id: {m_id}')
+        target_audio = create_target_audio()

-        if metric.startswith('MXWER'):
-            return None
+        def create_noise_audio() -> Callable:
+            state = None

-        if metric == 'MXSNR':
-            return self.snrs
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_noise(m_id)
+                return state

-        if metric == 'MXSSNRAVG':
-            return None
+            return get

-        if metric == 'MXSSNRSTD':
-            return None
+        noise_audio = create_noise_audio()

-        if metric == 'MXSSNRDAVG':
-            return None
+        def create_mixture_audio() -> Callable:
+            state = None

-        if metric == 'MXSSNRDSTD':
-            return None
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_mixture(m_id)
+                return state

-        if metric == 'MXPESQ':
-            return None
+            return get

-        if metric == 'MXWSDR':
-            return None
+        mixture_audio = create_mixture_audio()

-        if metric == 'MXPD':
-            return None
+        def create_segsnr_f() -> Callable:
+            state = None

-        if metric == 'MXSTOI':
-            return None
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_segsnr(m_id)
+                return state

-        if metric == 'MXCSIG':
-            return None
+            return get

-        if metric == 'MXCBAK':
-            return None
+        segsnr_f = create_segsnr_f()

-        if metric == 'MXCOVL':
-            return None
+        def create_speech() -> Callable:
+            state = None

-        if metric == 'TDCO':
-            return None
+            def get() -> SpeechMetrics:
+                nonlocal state
+                if state is None:
+                    state = calc_speech(hypothesis=mixture_audio(), reference=target_audio())
+                return state

-        if metric == 'TMIN':
-            return None
+            return get

-        if metric == 'TMAX':
-            return None
+        speech = create_speech()

-        if metric == 'TPKDB':
-            return None
+        def create_target_stats() -> Callable:
+            state = None

-        if metric == 'TLRMS':
-            return None
+            def get() -> AudioStatsMetrics:
+                nonlocal state
+                if state is None:
+                    state = calc_audio_stats(target_audio(), self.fg_info.ft_config.N / SAMPLE_RATE)
+                return state

-        if metric == 'TPKR':
-            return None
+            return get

-        if metric == 'TTR':
-            return None
+        target_stats = create_target_stats()

-        if metric == 'TCR':
-            return None
+        def create_noise_stats() -> Callable:
+            state = None

-        if metric == 'TFL':
-            return None
+            def get() -> AudioStatsMetrics:
+                nonlocal state
+                if state is None:
+                    state = calc_audio_stats(noise_audio(), self.fg_info.ft_config.N / SAMPLE_RATE)
+                return state

-        if metric == 'TPKC':
-            return None
+            return get

-        if metric == 'NDCO':
-            return None
+        noise_stats = create_noise_stats()

-        if metric == 'NMIN':
-            return None
+        def calc(m: str) -> float | int | Segsnr:
+            if m == 'mxsnr':
+                return self.mixture(m_id).snr

-        if metric == 'NMAX':
-            return None
+            # Get cached data first, if exists
+            if not force:
+                value = self.read_mixture_data(m_id, m)
+                if value is not None:
+                    return value

-        if metric == 'NPKDB':
-            return None
+            # Otherwise, generate data as needed
+            if m.startswith('mxwer'):
+                parts = m.split('.')
+                if len(parts) != 3:
+                    raise SonusAIError(
+                        f"Unrecognized 'mwwer' format: '{m}'; must be of the form: 'mxwer.<engine>.<model>'")
+                asr_engine = parts[1]
+                asr_model = parts[2]

-        if metric == 'NLRMS':
-            return None
+                if asr_engine == 'none' or self.mixture(m_id).snr < -96:
+                    # noise only, ignore/reset target asr
+                    return float('nan')

-        if metric == 'NPKR':
-            return None
+                # ignore mixup
+                target_asr = self.mixture_speech_metadata(m_id, 'text')[0]
+                if target_asr is None:
+                    target_asr = calc_asr(target_audio(), engine=asr_engine, whisper_model_name=asr_model).text

-        if metric == 'NTR':
-            return None
+                if target_asr:
+                    mixture_asr = calc_asr(mixture_audio(), engine=asr_engine, whisper_model_name=asr_model).text
+                    return calc_wer(mixture_asr, target_asr).wer * 100

-        if metric == 'NCR':
-            return None
+                # TODO: should this be NaN like above?
+                return float(0)

-        if metric == 'NFL':
-            return None
+            if m == 'mxssnravg':
+                return calc_snr_f(segsnr_f()).mean

-        if metric == 'NPKC':
-            return None
+            if m == 'mxssnrvar':
+                return calc_snr_f(segsnr_f()).var

-        if metric == 'SEDAVG':
-            return None
+            if m == 'mxssnrdavg':
+                return calc_snr_f(segsnr_f()).db_mean

-        if metric == 'SEDCNT':
-            return None
+            if m == 'mxssnrdstd':
+                return calc_snr_f(segsnr_f()).db_std

-        if metric == 'SEDTOPN':
-            return None
+            if m == 'mxpesq':
+                if self.mixture(m_id).snr < -96:
+                    return 0
+                return speech().pesq
+
+            if m == 'mxcsig':
+                if self.mixture(m_id).snr < -96:
+                    return 0
+                return speech().c_sig
+
+            if m == 'mxcbak':
+                if self.mixture(m_id).snr < -96:
+                    return 0
+                return speech().c_bak
+
+            if m == 'mxcovl':
+                if self.mixture(m_id).snr < -96:
+                    return 0
+                return speech().c_ovl
+
+            if m == 'mxwsdr':
+                mixture = mixture_audio()[:, np.newaxis]
+                target = target_audio()[:, np.newaxis]
+                noise = noise_audio()[:, np.newaxis]
+                return calc_wsdr(hypothesis=np.concatenate((mixture, noise), axis=1),
+                                 reference=np.concatenate((target, noise), axis=1),
+                                 with_log=True)[0]
+
+            if m == 'mxpd':
+                mixture_f = self.mixture_mixture_f(m_id)
+                target_f = self.mixture_target_f(m_id)
+                return calc_phase_distance(hypothesis=mixture_f, reference=target_f)[0]
+
+            if m == 'mxstoi':
+                return stoi(x=target_audio(), y=mixture_audio(), fs_sig=SAMPLE_RATE, extended=False)
+
+            if m == 'tdco':
+                return target_stats().dco
+
+            if m == 'tmin':
+                return target_stats().min
+
+            if m == 'tmax':
+                return target_stats().max
+
+            if m == 'tpkdb':
+                return target_stats().pkdb
+
+            if m == 'tlrms':
+                return target_stats().lrms
+
+            if m == 'tpkr':
+                return target_stats().pkr
+
+            if m == 'ttr':
+                return target_stats().tr
+
+            if m == 'tcr':
+                return target_stats().cr
+
+            if m == 'tfl':
+                return target_stats().fl
+
+            if m == 'tpkc':
+                return target_stats().pkc
+
+            if m == 'ndco':
+                return noise_stats().dco
+
+            if m == 'nmin':
+                return noise_stats().min
+
+            if m == 'nmax':
+                return noise_stats().max
+
+            if m == 'npkdb':
+                return noise_stats().pkdb
+
+            if m == 'nlrms':
+                return noise_stats().lrms
+
+            if m == 'npkr':
+                return noise_stats().pkr
+
+            if m == 'ntr':
+                return noise_stats().tr
+
+            if m == 'ncr':
+                return noise_stats().cr
+
+            if m == 'nfl':
+                return noise_stats().fl
+
+            if m == 'npkc':
+                return noise_stats().pkc
+
+            if m == 'sedavg':
+                return 0
+
+            if m == 'sedcnt':
+                return 0
+
+            if m == 'sedtopn':
+                return 0
+
+            if m == 'ssnr':
+                return self.mixture_segsnr(m_id)
+
+            raise SonusAIError(f"Unrecognized metric: '{m}'")
+
+        result: list[float | int | Segsnr] = []
+        for metric in metrics:
+            result.append(calc(metric))
+
+        return result


     @lru_cache
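
The single-metric mixture_metric method is replaced by mixture_metrics, which takes a list of metric names and computes shared intermediates (target, noise, and mixture audio, segmental SNR, speech metrics, and audio stats) lazily through small closures, so each is generated at most once per call. A minimal usage sketch, assuming MixtureDatabase can be constructed from a mixture database location; the location and mixture ID are placeholders:

    from sonusai.mixture import MixtureDatabase

    mixdb = MixtureDatabase('path/to/mixdb')  # placeholder location
    names = ['mxssnravg', 'mxpesq', 'tdco']
    values = mixdb.mixture_metrics(m_id=0, metrics=names)
    for name, value in zip(names, values):
        print(f'{name}: {value}')
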
@@ -1,8 +1,10 @@
+from pathlib import Path
+
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import ImpulseResponseData


-def _raw_read(name: str) -> tuple[AudioT, int]:
+def _raw_read(name: str | Path) -> tuple[AudioT, int]:
     import numpy as np
     import soundfile
     from pydub import AudioSegment
@@ -34,7 +36,7 @@ def _raw_read(name: str) -> tuple[AudioT, int]:
     return np.squeeze(raw[:, 0]), sample_rate


-def get_sample_rate(name: str) -> int:
+def get_sample_rate(name: str | Path) -> int:
     """Get sample rate from audio file using soundfile

     :param name: File name
@@ -63,7 +65,7 @@ def get_sample_rate(name: str) -> int:
         raise SonusAIError(f'Error reading {name}: {e}')


-def read_ir(name: str) -> ImpulseResponseData:
+def read_ir(name: str | Path) -> ImpulseResponseData:
     """Read impulse response data using soundfile

     :param name: File name
@@ -79,10 +81,10 @@ def read_ir(name: str) -> ImpulseResponseData:
     out = out[offset:]
     out = out / np.linalg.norm(out)

-    return ImpulseResponseData(name=name, sample_rate=sample_rate, data=out)
+    return ImpulseResponseData(name=str(name), sample_rate=sample_rate, data=out)


-def read_audio(name: str) -> AudioT:
+def read_audio(name: str | Path) -> AudioT:
     """Read audio data from a file using soundfile

     :param name: File name
@@ -101,7 +103,7 @@ def read_audio(name: str) -> AudioT:
     return out


-def get_num_samples(name: str) -> int:
+def get_num_samples(name: str | Path) -> int:
     """Get the number of samples resampled to the SonusAI sample rate in the given file

     :param name: File name
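
These soundfile-based helpers now accept either str or pathlib.Path file names. A minimal usage sketch, assuming the readers are re-exported from sonusai.mixture (if not, import them from the module that defines them); the file path is a placeholder:

    from pathlib import Path

    from sonusai.mixture import get_sample_rate
    from sonusai.mixture import read_audio

    wav = Path('speech') / 'example.wav'  # placeholder path
    audio = read_audio(wav)               # Path now accepted as well as str
    rate = get_sample_rate(wav)
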
@@ -1,16 +1,19 @@
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
 from sox import Transformer as SoxTransformer

 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import ImpulseResponseData


-def read_impulse_response(name: str) -> ImpulseResponseData:
+def read_impulse_response(name: str | Path) -> ImpulseResponseData:
     """Read impulse response data using SoX

     :param name: File name
     :return: ImpulseResponseData object
     """
-    import numpy as np
     from scipy.io import wavfile

     from sonusai import SonusAIError
@@ -33,10 +36,10 @@ def read_impulse_response(name: str) -> ImpulseResponseData:
     data = data[offset:]
     data = data / np.linalg.norm(data)

-    return ImpulseResponseData(name=name, sample_rate=sample_rate, data=data)
+    return ImpulseResponseData(name=str(name), sample_rate=sample_rate, data=data)


-def read_audio(name: str) -> AudioT:
+def read_audio(name: str | Path) -> AudioT:
     """Read audio data from a file using SoX

     :param name: File name
@@ -44,7 +47,6 @@ def read_audio(name: str) -> AudioT:
     """
     from typing import Any

-    import numpy as np
     from sox.core import sox

     from sonusai import SonusAIError
@@ -208,8 +210,11 @@ class Transformer(SoxTransformer):

         return self

-    def build_array(self, input_filepath=None, input_array=None,
-                    sample_rate_in=None, extra_args=None):
+    def build_array(self,
+                    input_filepath: Optional[str | Path] = None,
+                    input_array: Optional[np.ndarray] = None,
+                    sample_rate_in: Optional[int] = None,
+                    extra_args: Optional[list[str]] = None) -> np.ndarray:
         """Given an input file or array, returns the output as a numpy array
         by executing the current set of commands. By default, the array will
         have the same sample rate as the input file unless otherwise specified
@@ -220,7 +225,7 @@ class Transformer(SoxTransformer):

         Parameters
         ----------
-        input_filepath : str or None
+        input_filepath : str, Path or None
             Either path to input audio file or None.
         input_array : np.ndarray or None
             A np.ndarray of a waveform with shape (n_samples, n_channels).
@@ -270,8 +275,6 @@ class Transformer(SoxTransformer):


         """
-        import numpy as np
-
         from sox.core import SoxError
         from sox.core import sox
         from sox.log import logger
@@ -324,13 +327,13 @@ class Transformer(SoxTransformer):

         match n_bits:
             case 8:
-                encoding_out = np.int8
+                encoding_out = np.int8  # type: ignore
             case 16:
                 encoding_out = np.int16
             case 32:
-                encoding_out = np.float32
+                encoding_out = np.float32  # type: ignore
             case 64:
-                encoding_out = np.float64
+                encoding_out = np.float64  # type: ignore
             case _:
                 raise ValueError("invalid n_bits {}".format(n_bits))

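
The build_array override now carries type annotations for the behavior it already had: it takes an input file path (str or Path) or an in-memory array plus its sample rate, and returns a NumPy array whose dtype follows the n_bits match above (8 -> int8, 16 -> int16, 32 -> float32, 64 -> float64). A hedged usage sketch; the module path sonusai.mixture.sox_audio and the values below are assumptions for illustration:

    import numpy as np

    from sonusai.mixture.sox_audio import Transformer  # module path is an assumption

    tfm = Transformer()
    tfm.rate(16000)                            # resample via SoX
    x = np.zeros((8000, 1), dtype=np.float32)  # (n_samples, n_channels)
    y = tfm.build_array(input_array=x, sample_rate_in=44100)
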
@@ -1,8 +1,10 @@
+from pathlib import Path
+
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import ImpulseResponseData


-def read_impulse_response(name: str) -> ImpulseResponseData:
+def read_impulse_response(name: str | Path) -> ImpulseResponseData:
     """Read impulse response data using torchaudio

     :param name: File name
@@ -36,10 +38,10 @@ def read_impulse_response(name: str) -> ImpulseResponseData:
     data = np.array(raw).astype(np.float32)
     data = data / np.linalg.norm(data)

-    return ImpulseResponseData(name=name, sample_rate=sample_rate, data=data)
+    return ImpulseResponseData(name=str(name), sample_rate=sample_rate, data=data)


-def get_sample_rate(name: str) -> int:
+def get_sample_rate(name: str | Path) -> int:
     """Get sample rate from audio file using torchaudio

     :param name: File name
@@ -61,7 +63,7 @@ def get_sample_rate(name: str) -> int:
         raise SonusAIError(f'Error reading {name}:\n{e}')


-def read_audio(name: str) -> AudioT:
+def read_audio(name: str | Path) -> AudioT:
     """Read audio data from a file using torchaudio

     :param name: File name
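
As in the soundfile and SoX readers above, the torchaudio helpers now accept str | Path and normalize the name with str(name) before constructing ImpulseResponseData, which stores the name as a plain string. A hedged usage sketch; the module path sonusai.mixture.torchaudio_audio and the file path are assumptions for illustration:

    from pathlib import Path

    from sonusai.mixture.torchaudio_audio import read_impulse_response  # module path is an assumption

    ir = read_impulse_response(Path('impulse_responses') / 'room.wav')  # placeholder path
    print(ir.name, ir.sample_rate, len(ir.data))
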