PyPI - sonusai - Versions diffs - 0.18.6__py3-none-any.whl → 0.18.8__py3-none-any.whl - Mend

sonusai 0.18.6__py3-none-any.whl → 0.18.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

sonusai/__init__.py +6 -1
sonusai/genmetrics.py +4 -4
sonusai/metrics/__init__.py +2 -1
sonusai/metrics/calc_audio_stats.py +9 -1
sonusai/metrics/calc_segsnr_f.py +84 -0
sonusai/metrics/calc_speech.py +5 -5
sonusai/mixture/__init__.py +3 -0
sonusai/mixture/datatypes.py +65 -6
sonusai/mixture/feature.py +4 -19
sonusai/mixture/helpers.py +50 -39
sonusai/mixture/mixdb.py +198 -59
sonusai/mixture/sox_audio.py +125 -0
sonusai/mixture/truth_functions/data.py +23 -22
sonusai/mixture/truth_functions/energy.py +3 -1
sonusai/mixture/truth_functions/sed.py +2 -1
sonusai/mixture/truth_functions/target.py +3 -4
sonusai/utils/__init__.py +2 -0
sonusai/utils/compress.py +25 -0
sonusai/utils/energy_f.py +3 -4
{sonusai-0.18.6.dist-info → sonusai-0.18.8.dist-info}/METADATA +1 -1
{sonusai-0.18.6.dist-info → sonusai-0.18.8.dist-info}/RECORD +23 -23
sonusai/metrics/calc_snr_f.py +0 -34
sonusai/post_spenh_targetf.py +0 -160
{sonusai-0.18.6.dist-info → sonusai-0.18.8.dist-info}/WHEEL +0 -0
{sonusai-0.18.6.dist-info → sonusai-0.18.8.dist-info}/entry_points.txt +0 -0

sonusai/mixture/mixdb.py CHANGED Viewed

@@ -17,6 +17,8 @@ from sonusai.mixture.datatypes import FeatureGeneratorConfig
 from sonusai.mixture.datatypes import FeatureGeneratorInfo
 from sonusai.mixture.datatypes import GeneralizedIDs
 from sonusai.mixture.datatypes import ImpulseResponseFiles
+from sonusai.mixture.datatypes import MetricDoc
+from sonusai.mixture.datatypes import MetricDocs
 from sonusai.mixture.datatypes import Mixture
 from sonusai.mixture.datatypes import Mixtures
 from sonusai.mixture.datatypes import NoiseFile
@@ -155,19 +157,69 @@ class MixtureDatabase:
             return json.loads(c.execute("SELECT top.asr_configs FROM top").fetchone()[0])
     @cached_property
-    def supported_metrics(self) -> set[str]:
-        metrics = {
-            'mxssnravg', 'mxssnrvar', 'mxssnrdavg', 'mxssnrdstd',
-            'mxpesq', 'mxcsig', 'mxcbak', 'mxcovl', 'mxwsdr',
-            'mxpd',
-            'mxstoi',
-            'tdco', 'tmin', 'tmax', 'tpkdb', 'tlrms', 'tpkr', 'ttr', 'tcr', 'tfl', 'tpkc',
-            'ndco', 'nmin', 'nmax', 'npkdb', 'nlrms', 'npkr', 'ntr', 'ncr', 'nfl', 'npkc',
-            'sedavg', 'sedcnt', 'sedtopn',
-            'ssnr',
-        }
+    def supported_metrics(self) -> MetricDocs:
+        metrics = MetricDocs([
+            MetricDoc('Mixture Metrics', 'mxsnr', 'SNR specification in dB'),
+            MetricDoc('Mixture Metrics', 'mxssnr_avg', 'Segmental SNR average over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnr_std', 'Segmental SNR standard deviation over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnrdb_avg',
+                      'Segmental SNR average of the dB frame values over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnrdb_std',
+                      'Segmental SNR standard deviation of the dB frame values over all frames'),
+            MetricDoc('Mixture Metrics', 'mxssnrf_avg',
+                      'Per-bin segmental SNR average over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxssnrf_std',
+                      'Per-bin segmental SNR standard deviation over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxssnrdbf_avg',
+                      'Per-bin segmental average of the dB frame values over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxssnrdbf_std',
+                      'Per-bin segmental standard deviation of the dB frame values over all frames (using feature transform)'),
+            MetricDoc('Mixture Metrics', 'mxpesq', 'PESQ of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxwsdr', 'Weighted signal distorion ratio of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxpd', 'Phase distance between mixture and true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxstoi',
+                      'Short term objective intelligibility of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxcsig',
+                      'Predicted rating of speech distortion of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxcbak',
+                      'Predicted rating of background distortion of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'mxcovl',
+                      'Predicted rating of overall quality of mixture versus true target[0]'),
+            MetricDoc('Mixture Metrics', 'ssnr', 'Segmental SNR'),
+            MetricDoc('Target Metrics', 'tdco', 'Target[0] DC offset'),
+            MetricDoc('Target Metrics', 'tmin', 'Target[0] min level'),
+            MetricDoc('Target Metrics', 'tmax', 'Target[0] max levl'),
+            MetricDoc('Target Metrics', 'tpkdb', 'Target[0] Pk lev dB'),
+            MetricDoc('Target Metrics', 'tlrms', 'Target[0] RMS lev dB'),
+            MetricDoc('Target Metrics', 'tpkr', 'Target[0] RMS Pk dB'),
+            MetricDoc('Target Metrics', 'ttr', 'Target[0] RMS Tr dB'),
+            MetricDoc('Target Metrics', 'tcr', 'Target[0] Crest factor'),
+            MetricDoc('Target Metrics', 'tfl', 'Target[0] Flat factor'),
+            MetricDoc('Target Metrics', 'tpkc', 'Target[0] Pk count'),
+            MetricDoc('Noise Metrics', 'ndco', 'Noise DC offset'),
+            MetricDoc('Noise Metrics', 'nmin', 'Noise min level'),
+            MetricDoc('Noise Metrics', 'nmax', 'Noise max levl'),
+            MetricDoc('Noise Metrics', 'npkdb', 'Noise Pk lev dB'),
+            MetricDoc('Noise Metrics', 'nlrms', 'Noise RMS lev dB'),
+            MetricDoc('Noise Metrics', 'npkr', 'Noise RMS Pk dB'),
+            MetricDoc('Noise Metrics', 'ntr', 'Noise RMS Tr dB'),
+            MetricDoc('Noise Metrics', 'ncr', 'Noise Crest factor'),
+            MetricDoc('Noise Metrics', 'nfl', 'Noise Flat factor'),
+            MetricDoc('Noise Metrics', 'npkc', 'Noise Pk count'),
+            MetricDoc('Truth Metrics', 'sedavg',
+                      '(not implemented) Average SED activity over all frames [num_classes, 1]'),
+            MetricDoc('Truth Metrics', 'sedcnt',
+                      '(not implemented) Count in number of frames that SED is active [num_classes, 1]'),
+            MetricDoc('Truth Metrics', 'sedtop3', '(not implemented) 3 most active by largest sedavg [3, 1]'),
+            MetricDoc('Truth Metrics', 'sedtopn', '(not implemented) N most active by largest sedavg [N, 1]'),
+        ])
         for name in self.asr_configs:
-            metrics.add(f'mxwer.{name}')
+            metrics.append(MetricDoc('Target Metrics', f'tasr.{name}',
+                                     f'Target[0] ASR text using {name} ASR as defined in mixdb asr_configs parameter'))
+            metrics.append(MetricDoc('Mixture Metrics', f'mxasr.{name}',
+                                     f'ASR text using {name} ASR as defined in mixdb asr_configs parameter'))
+            metrics.append(MetricDoc('Mixture Metrics', f'mxwer.{name}',
+                                     f'Word error rate using {name} ASR as defined in mixdb asr_configs parameter'))
         return metrics
@@ -240,11 +292,15 @@ class MixtureDatabase:
     def total_feature_frames(self, m_ids: GeneralizedIDs = '*') -> int:
         return self.total_samples(m_ids) // self.feature_step_samples
-    def mixture_transform_frames(self, samples: int) -> int:
-        return samples // self.ft_config.R
+    def mixture_transform_frames(self, m_id: int) -> int:
+        from .helpers import frames_from_samples
-    def mixture_feature_frames(self, samples: int) -> int:
-        return samples // self.feature_step_samples
+        return frames_from_samples(self.mixture(m_id).samples, self.ft_config.R)
+    def mixture_feature_frames(self, m_id: int) -> int:
+        from .helpers import frames_from_samples
+        return frames_from_samples(self.mixture(m_id).samples, self.feature_step_samples)
     def mixids_to_list(self, m_ids: Optional[GeneralizedIDs] = None) -> list[int]:
         """Resolve generalized mixture IDs to a list of integers
@@ -907,8 +963,8 @@ class MixtureDatabase:
             truth_t = self.mixture_truth_t(m_id=m_id, targets=targets, noise=noise, force=force)
         m = self.mixture(m_id)
-        transform_frames = self.mixture_transform_frames(m.samples)
-        feature_frames = self.mixture_feature_frames(m.samples)
+        transform_frames = self.mixture_transform_frames(m_id)
+        feature_frames = self.mixture_feature_frames(m_id)
         if truth_t is None:
             truth_t = np.zeros((m.samples, self.num_classes), dtype=np.float32)
@@ -1133,7 +1189,7 @@ class MixtureDatabase:
     def mixture_metrics(self, m_id: int,
                         metrics: list[str],
-                        force: bool = False) -> list[float | int | Segsnr]:
+                        force: bool = False) -> list[float | int | str | Segsnr]:
         """Get metrics data for the given mixture ID
         :param m_id: Zero-based mixture ID
@@ -1149,7 +1205,8 @@ class MixtureDatabase:
         from sonusai import SonusAIError
         from sonusai.metrics import calc_audio_stats
         from sonusai.metrics import calc_phase_distance
-        from sonusai.metrics import calc_snr_f
+        from sonusai.metrics import calc_segsnr_f
+        from sonusai.metrics import calc_segsnr_f_bin
         from sonusai.metrics import calc_speech
         from sonusai.metrics import calc_wer
         from sonusai.metrics import calc_wsdr
@@ -1158,7 +1215,7 @@ class MixtureDatabase:
         from sonusai.mixture import SpeechMetrics
         from sonusai.utils import calc_asr
-        def create_target_audio() -> Callable:
+        def create_target_audio() -> Callable[[], np.ndarray]:
             state = None
             def get() -> np.ndarray:
@@ -1171,7 +1228,20 @@ class MixtureDatabase:
         target_audio = create_target_audio()
-        def create_noise_audio() -> Callable:
+        def create_target_f() -> Callable[[], np.ndarray]:
+            state = None
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_targets_f(m_id)[0]
+                return state
+            return get
+        target_f = create_target_f()
+        def create_noise_audio() -> Callable[[], np.ndarray]:
             state = None
             def get() -> np.ndarray:
@@ -1184,7 +1254,20 @@ class MixtureDatabase:
         noise_audio = create_noise_audio()
-        def create_mixture_audio() -> Callable:
+        def create_noise_f() -> Callable[[], np.ndarray]:
+            state = None
+            def get() -> np.ndarray:
+                nonlocal state
+                if state is None:
+                    state = self.mixture_noise_f(m_id)
+                return state
+            return get
+        noise_f = create_noise_f()
+        def create_mixture_audio() -> Callable[[], np.ndarray]:
             state = None
             def get() -> np.ndarray:
@@ -1197,7 +1280,7 @@ class MixtureDatabase:
         mixture_audio = create_mixture_audio()
-        def create_segsnr_f() -> Callable:
+        def create_segsnr_f() -> Callable[[], np.ndarray]:
             state = None
             def get() -> np.ndarray:
@@ -1210,7 +1293,7 @@ class MixtureDatabase:
         segsnr_f = create_segsnr_f()
-        def create_speech() -> Callable:
+        def create_speech() -> Callable[[], SpeechMetrics]:
             state = None
             def get() -> SpeechMetrics:
@@ -1223,7 +1306,7 @@ class MixtureDatabase:
         speech = create_speech()
-        def create_target_stats() -> Callable:
+        def create_target_stats() -> Callable[[], AudioStatsMetrics]:
             state = None
             def get() -> AudioStatsMetrics:
@@ -1236,7 +1319,7 @@ class MixtureDatabase:
         target_stats = create_target_stats()
-        def create_noise_stats() -> Callable:
+        def create_noise_stats() -> Callable[[], AudioStatsMetrics]:
             state = None
             def get() -> AudioStatsMetrics:
@@ -1249,7 +1332,56 @@ class MixtureDatabase:
         noise_stats = create_noise_stats()
-        def calc(m: str) -> float | int | Segsnr:
+        def create_asr_config() -> Callable[[str], dict]:
+            state: dict[str, dict] = {}
+            def get(asr_name) -> dict:
+                nonlocal state
+                if asr_name not in state:
+                    state[asr_name] = self.asr_configs.get(asr_name, None)
+                    if state[asr_name] is None:
+                        raise SonusAIError(f"Unrecognized ASR name: '{asr_name}'")
+                return state[asr_name]
+            return get
+        asr_config = create_asr_config()
+        def create_target_asr() -> Callable[[str], str]:
+            state: dict[str, str] = {}
+            def get(asr_name) -> str:
+                nonlocal state
+                if asr_name not in state:
+                    state[asr_name] = calc_asr(target_audio(), **asr_config(asr_name)).text
+                return state[asr_name]
+            return get
+        target_asr = create_target_asr()
+        def create_mixture_asr() -> Callable[[str], str]:
+            state: dict[str, str] = {}
+            def get(asr_name) -> str:
+                nonlocal state
+                if asr_name not in state:
+                    state[asr_name] = calc_asr(mixture_audio(), **asr_config(asr_name)).text
+                return state[asr_name]
+            return get
+        mixture_asr = create_mixture_asr()
+        def get_asr_name(m: str) -> str:
+            parts = m.split('.')
+            if len(parts) != 2:
+                raise SonusAIError(
+                    f"Unrecognized format: '{m}'; must be of the form: '<metric>.<name>'")
+            asr_name = parts[1]
+            return asr_name
+        def calc(m: str) -> float | int | str | Segsnr:
             if m == 'mxsnr':
                 return self.mixture(m_id).snr
@@ -1261,42 +1393,44 @@ class MixtureDatabase:
             # Otherwise, generate data as needed
             if m.startswith('mxwer'):
-                parts = m.split('.')
-                if len(parts) != 2:
-                    raise SonusAIError(
-                        f"Unrecognized 'mxwer' format: '{m}'; must be of the form: 'mxwer.<name>'")
-                asr_name = parts[1]
-                asr_config = self.asr_configs.get(asr_name, None)
-                if asr_config is None:
-                    raise SonusAIError(f"Unrecognized metric: '{m}'")
+                asr_name = get_asr_name(m)
                 if self.mixture(m_id).snr < -96:
                     # noise only, ignore/reset target asr
                     return float('nan')
-                # ignore mixup
-                target_asr = self.mixture_speech_metadata(m_id, 'text')[0]
-                if target_asr is None:
-                    target_asr = calc_asr(target_audio(), **asr_config).text
-                if target_asr:
-                    mixture_asr = calc_asr(mixture_audio(), **asr_config).text
-                    return calc_wer(mixture_asr, target_asr).wer * 100
+                if target_asr(asr_name):
+                    return calc_wer(mixture_asr(asr_name), target_asr(asr_name)).wer * 100
                 # TODO: should this be NaN like above?
                 return float(0)
-            if m == 'mxssnravg':
-                return calc_snr_f(segsnr_f()).mean
+            if m.startswith('mxasr'):
+                return mixture_asr(get_asr_name(m))
+            if m == 'mxssnr_avg':
+                return calc_segsnr_f(segsnr_f()).avg
-            if m == 'mxssnrvar':
-                return calc_snr_f(segsnr_f()).var
+            if m == 'mxssnr_std':
+                return calc_segsnr_f(segsnr_f()).std
-            if m == 'mxssnrdavg':
-                return calc_snr_f(segsnr_f()).db_mean
+            if m == 'mxssnrdb_avg':
+                return calc_segsnr_f(segsnr_f()).db_avg
-            if m == 'mxssnrdstd':
-                return calc_snr_f(segsnr_f()).db_std
+            if m == 'mxssnrdb_std':
+                return calc_segsnr_f(segsnr_f()).db_std
+            if m == 'mxssnrf_avg':
+                return calc_segsnr_f_bin(target_f(), noise_f()).avg
+            if m == 'mxssnrf_std':
+                return calc_segsnr_f_bin(target_f(), noise_f()).std
+            if m == 'mxssnrdbf_avg':
+                return calc_segsnr_f_bin(target_f(), noise_f()).db_avg
+            if m == 'mxssnrdbf_std':
+                return calc_segsnr_f_bin(target_f(), noise_f()).db_std
             if m == 'mxpesq':
                 if self.mixture(m_id).snr < -96:
@@ -1306,17 +1440,17 @@ class MixtureDatabase:
             if m == 'mxcsig':
                 if self.mixture(m_id).snr < -96:
                     return 0
-                return speech().c_sig
+                return speech().csig
             if m == 'mxcbak':
                 if self.mixture(m_id).snr < -96:
                     return 0
-                return speech().c_bak
+                return speech().cbak
             if m == 'mxcovl':
                 if self.mixture(m_id).snr < -96:
                     return 0
-                return speech().c_ovl
+                return speech().covl
             if m == 'mxwsdr':
                 mixture = mixture_audio()[:, np.newaxis]
@@ -1328,8 +1462,7 @@ class MixtureDatabase:
             if m == 'mxpd':
                 mixture_f = self.mixture_mixture_f(m_id)
-                target_f = self.mixture_target_f(m_id)
-                return calc_phase_distance(hypothesis=mixture_f, reference=target_f)[0]
+                return calc_phase_distance(hypothesis=mixture_f, reference=target_f())[0]
             if m == 'mxstoi':
                 return stoi(x=target_audio(), y=mixture_audio(), fs_sig=SAMPLE_RATE, extended=False)
@@ -1364,6 +1497,9 @@ class MixtureDatabase:
             if m == 'tpkc':
                 return target_stats().pkc
+            if m.startswith('tasr'):
+                return target_asr(get_asr_name(m))
             if m == 'ndco':
                 return noise_stats().dco
@@ -1400,15 +1536,18 @@ class MixtureDatabase:
             if m == 'sedcnt':
                 return 0
+            if m == 'sedtop3':
+                return np.zeros(3, dtype=np.float32)
             if m == 'sedtopn':
                 return 0
             if m == 'ssnr':
-                return self.mixture_segsnr(m_id)
+                return segsnr_f()
             raise SonusAIError(f"Unrecognized metric: '{m}'")
-        result: list[float | int | Segsnr] = []
+        result: list[float | int | str | Segsnr] = []
         for metric in metrics:
             result.append(calc(metric))

sonusai/mixture/sox_audio.py CHANGED Viewed

@@ -210,6 +210,131 @@ class Transformer(SoxTransformer):
         return self
+    def build(self,
+              input_filepath: Optional[str | Path] = None,
+              output_filepath: Optional[str | Path] = None,
+              input_array: Optional[np.ndarray] = None,
+              sample_rate_in: Optional[float] = None,
+              extra_args: Optional[list[str]] = None,
+              return_output: bool = False) -> tuple[bool, Optional[str], Optional[str]]:
+        """Given an input file or array, creates an output_file on disk by
+        executing the current set of commands. This function returns True on
+        success. If return_output is True, this function returns a triple of
+        (status, out, err), giving the success state, along with stdout and
+        stderr returned by sox.
+        Parameters
+        ----------
+        input_filepath : str or None
+            Either path to input audio file or None for array input.
+        output_filepath : str
+            Path to desired output file. If a file already exists at
+            the given path, the file will be overwritten.
+            If '-n', no file is created.
+        input_array : np.ndarray or None
+            An np.ndarray of an waveform with shape (n_samples, n_channels).
+            sample_rate_in must also be provided.
+            If None, input_filepath must be specified.
+        sample_rate_in : int
+            Sample rate of input_array.
+            This argument is ignored if input_array is None.
+        extra_args : list or None, default=None
+            If a list is given, these additional arguments are passed to SoX
+            at the end of the list of effects.
+            Don't use this argument unless you know exactly what you're doing!
+        return_output : bool, default=False
+            If True, returns the status and information sent to stderr and
+            stdout as a tuple (status, stdout, stderr).
+            If output_filepath is None, return_output=True by default.
+            If False, returns True on success.
+        Returns
+        -------
+        status : bool
+            True on success.
+        out : str (optional)
+            This is not returned unless return_output is True.
+            When returned, captures the stdout produced by sox.
+        err : str (optional)
+            This is not returned unless return_output is True.
+            When returned, captures the stderr produced by sox.
+        Examples
+        --------
+        > import numpy as np
+        > import sox
+        > tfm = sox.Transformer()
+        > sample_rate = 44100
+        > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)
+        file in, file out - basic usage
+        > status = tfm.build('path/to/input.wav', 'path/to/output.mp3')
+        file in, file out - equivalent usage
+        > status = tfm.build(
+                input_filepath='path/to/input.wav',
+                output_filepath='path/to/output.mp3'
+            )
+        array in, file out
+        > status = tfm.build(
+                input_array=y, sample_rate_in=sample_rate,
+                output_filepath='path/to/output.mp3'
+            )
+        """
+        from sox import file_info
+        from sox.core import SoxError
+        from sox.core import sox
+        from sox.log import logger
+        input_format, input_filepath = self._parse_inputs(
+            input_filepath, input_array, sample_rate_in
+        )
+        if output_filepath is None:
+            raise ValueError("output_filepath is not specified!")
+        # set output parameters
+        if input_filepath == output_filepath:
+            raise ValueError(
+                "input_filepath must be different from output_filepath."
+            )
+        file_info.validate_output_file(output_filepath)
+        args = []
+        args.extend(self.globals)
+        args.extend(self._input_format_args(input_format))
+        args.append(input_filepath)
+        args.extend(self._output_format_args(self.output_format))
+        args.append(output_filepath)
+        args.extend(self.effects)
+        if extra_args is not None:
+            if not isinstance(extra_args, list):
+                raise ValueError("extra_args must be a list.")
+            args.extend(extra_args)
+        status, out, err = sox(args, input_array, True)
+        if status != 0:
+            raise SoxError(
+                f"Stdout: {out}\nStderr: {err}"
+            )
+        logger.info(
+            "Created %s with effects: %s",
+            output_filepath,
+            " ".join(self.effects_log)
+        )
+        if return_output:
+            return status, out, err
+        return True, None, None
     def build_array(self,
                     input_filepath: Optional[str | Path] = None,
                     input_array: Optional[np.ndarray] = None,

sonusai/mixture/truth_functions/data.py CHANGED Viewed

@@ -3,13 +3,14 @@ from sonusai.mixture.datatypes import TruthFunctionConfig
 class Data:
-    def __init__(self, target_audio: AudioT,
+    def __init__(self,
+                 target_audio: AudioT,
                  noise_audio: AudioT,
                  mixture_audio: AudioT,
                  config: TruthFunctionConfig) -> None:
         import numpy as np
-        from pyaaware import AawareForwardTransform
-        from pyaaware import AawareInverseTransform
+        from sonusai import ForwardTransform
+        from sonusai import InverseTransform
         from pyaaware import FeatureGenerator
         from sonusai import SonusAIError
@@ -33,25 +34,25 @@ class Data:
         self.offsets = range(0, len(target_audio), self.frame_size)
         self.zero_based_indices = [x - 1 for x in config.index]
-        self.target_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                                 R=fg.ftransform_R,
-                                                 bin_start=fg.bin_start,
-                                                 bin_end=fg.bin_end,
-                                                 ttype=fg.ftransform_ttype)
-        self.noise_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                                R=fg.ftransform_R,
-                                                bin_start=fg.bin_start,
-                                                bin_end=fg.bin_end,
-                                                ttype=fg.ftransform_ttype)
-        self.mixture_fft = AawareForwardTransform(N=fg.ftransform_N,
-                                                  R=fg.ftransform_R,
-                                                  bin_start=fg.bin_start,
-                                                  bin_end=fg.bin_end,
-                                                  ttype=fg.ftransform_ttype)
-        self.swin = AawareInverseTransform(N=fg.itransform_N,
-                                           R=fg.itransform_R,
+        self.target_fft = ForwardTransform(N=fg.ftransform_N,
+                                           R=fg.ftransform_R,
                                            bin_start=fg.bin_start,
                                            bin_end=fg.bin_end,
-                                           ttype=fg.itransform_ttype,
-                                           gain=np.float32(1)).W
+                                           ttype=fg.ftransform_ttype)
+        self.noise_fft = ForwardTransform(N=fg.ftransform_N,
+                                          R=fg.ftransform_R,
+                                          bin_start=fg.bin_start,
+                                          bin_end=fg.bin_end,
+                                          ttype=fg.ftransform_ttype)
+        self.mixture_fft = ForwardTransform(N=fg.ftransform_N,
+                                            R=fg.ftransform_R,
+                                            bin_start=fg.bin_start,
+                                            bin_end=fg.bin_end,
+                                            ttype=fg.ftransform_ttype)
+        self.swin = InverseTransform(N=fg.itransform_N,
+                                     R=fg.itransform_R,
+                                     bin_start=fg.bin_start,
+                                     bin_end=fg.bin_end,
+                                     ttype=fg.itransform_ttype,
+                                     gain=np.float32(1)).W
         self.truth = np.zeros((len(target_audio), config.num_classes), dtype=np.float32)

sonusai/mixture/truth_functions/energy.py CHANGED Viewed

@@ -132,9 +132,11 @@ def energy_t(data: Data) -> Truth:
     will reflect the total energy over all bins regardless of the feature
     transform config.
     """
+    import torch
     from sonusai import SonusAIError
-    _, target_energy = data.target_fft.execute_all(data.target_audio)
+    target_energy = data.target_fft.execute_all(torch.from_numpy(data.target_audio))[1].numpy()
     if len(target_energy) != len(data.offsets):
         raise SonusAIError(f'Number of frames in target_energy, {len(target_energy)},'
                            f' is not number of frames in truth, {len(data.offsets)}')

sonusai/mixture/truth_functions/sed.py CHANGED Viewed

@@ -21,6 +21,7 @@ should be set to the number of sounds/classes to be detected + 1 for
 the other class.
     """
     import numpy as np
+    import torch
     from pyaaware import SED
     from sonusai import SonusAIError
@@ -48,7 +49,7 @@ the other class.
             mutex=data.config.mutex)
     target_audio = data.target_audio / data.config.target_gain
-    _, energy_t = data.target_fft.execute_all(target_audio)
+    energy_t = data.target_fft.execute_all(torch.from_numpy(target_audio))[1].numpy()
     if len(energy_t) != len(data.offsets):
         raise SonusAIError(f'Number of frames in energy_t, {len(energy_t)},'
                            f' is not number of frames in truth, {len(data.offsets)}')

sonusai/mixture/truth_functions/target.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from pyaaware import ForwardTransform
+from sonusai import ForwardTransform
 from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
@@ -98,7 +98,6 @@ Output shape: [:, 2 * bins] (stacked real, imag)
     for idx, offset in enumerate(data.offsets):
         target_freq, _ = data.target_fft.execute(
             np.multiply(data.target_audio[offset:offset + data.frame_size], data.swin))
-        target_freq = target_freq.transpose()
         indices = slice(offset, offset + data.frame_size)
         for index in data.zero_based_indices:
@@ -112,10 +111,10 @@ Output shape: [:, 2 * bins] (stacked real, imag)
 def _execute_fft(audio: AudioT, transform: ForwardTransform, expected_frames: int) -> AudioF:
+    import torch
     from sonusai import SonusAIError
-    freq, _ = transform.execute_all(audio)
-    freq = freq.transpose()
+    freq = transform.execute_all(torch.from_numpy(audio))[0].numpy()
     if len(freq) != expected_frames:
         raise SonusAIError(f'Number of frames, {len(freq)}, is not number of frames expected, {expected_frames}')
     return freq

sonusai 0.18.6__py3-none-any.whl → 0.18.8__py3-none-any.whl

sonusai 0.18.6__py3-none-any.whl → 0.18.8__py3-none-any.whl