sonusai 0.15.6__py3-none-any.whl → 0.15.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/audiofe.py +293 -0
- sonusai/calc_metric_spenh.py +3 -3
- sonusai/data_generator/dataset_from_mixdb.py +1 -1
- sonusai/data_generator/keras_from_mixdb.py +1 -1
- sonusai/genft.py +2 -1
- sonusai/genmixdb.py +4 -4
- sonusai/keras_predict.py +1 -1
- sonusai/lsdb.py +2 -2
- sonusai/main.py +2 -2
- sonusai/mixture/__init__.py +3 -2
- sonusai/mixture/audio.py +0 -34
- sonusai/mixture/datatypes.py +1 -1
- sonusai/mixture/feature.py +75 -21
- sonusai/mixture/generation.py +3 -3
- sonusai/mixture/helpers.py +60 -30
- sonusai/mixture/log_duration_and_sizes.py +2 -2
- sonusai/mixture/mixdb.py +13 -10
- sonusai/mixture/spectral_mask.py +14 -14
- sonusai/mixture/truth_functions/data.py +2 -0
- sonusai/mixture/truth_functions/target.py +14 -6
- sonusai/onnx_predict.py +1 -1
- sonusai/plot.py +4 -4
- sonusai/post_spenh_targetf.py +8 -8
- sonusai/torchl_predict.py +71 -76
- sonusai/utils/__init__.py +4 -0
- sonusai/utils/audio_devices.py +41 -0
- sonusai/utils/calculate_input_shape.py +3 -4
- sonusai/utils/create_timestamp.py +5 -0
- sonusai/utils/reshape.py +11 -11
- sonusai/utils/wave.py +12 -5
- {sonusai-0.15.6.dist-info → sonusai-0.15.9.dist-info}/METADATA +21 -18
- {sonusai-0.15.6.dist-info → sonusai-0.15.9.dist-info}/RECORD +34 -32
- {sonusai-0.15.6.dist-info → sonusai-0.15.9.dist-info}/WHEEL +1 -1
- sonusai/evaluate.py +0 -245
- {sonusai-0.15.6.dist-info → sonusai-0.15.9.dist-info}/entry_points.txt +0 -0
sonusai/audiofe.py
ADDED
@@ -0,0 +1,293 @@
+"""sonusai audiofe
+
+usage: audiofe [-hvds] [--version] [-i INPUT] [-l LENGTH] [-m MODEL] [-k CKPT] [-a ASR] [-w WMODEL]
+
+options:
+    -h, --help
+    -v, --verbose                Be verbose.
+    -d, --debug                  Write debug data to H5 file.
+    -s, --show                   Show a list of available audio inputs.
+    -i INPUT, --input INPUT      Input audio.
+    -l LENGTH, --length LENGTH   Length of audio in seconds. [default: -1].
+    -m MODEL, --model MODEL      PL model .py file path.
+    -k CKPT, --checkpoint CKPT   PL checkpoint file with weights.
+    -a ASR, --asr ASR            ASR method to use.
+    -w WMODEL, --whisper WMODEL  Whisper model used in aixplain_whisper and whisper methods. [default: tiny].
+
+Aaware SonusAI Audio Front End.
+
+Capture LENGTH seconds of audio from INPUT. If LENGTH is < 0, then capture until key is pressed. If INPUT is a valid
+audio file name, then use the audio data from the specified file. In this case, if LENGTH is < 0, process entire file;
+otherwise, process min(length(INPUT), LENGTH) seconds of audio from INPUT. Audio is saved to
+audiofe_capture_<TIMESTAMP>.wav.
+
+If a model is specified, run prediction on audio data from this model. Then compute the inverse transform of the
+prediction result and save to audiofe_predict_<TIMESTAMP>.wav.
+
+If an ASR is specified, run ASR on the captured audio and print the results. In addition, if a model was also specified,
+run ASR on the predict audio and print the results.
+
+If the debug option is enabled, write capture audio, feature, reconstruct audio, predict, and predict audio to
+audiofe_<TIMESTAMP>.h5.
+
+"""
+from os.path import exists
+from select import select
+from sys import stdin
+from typing import Any
+
+import h5py
+import numpy as np
+import pyaudio
+import torch
+from docopt import docopt
+from docopt import printable_usage
+
+import sonusai
+from sonusai import create_file_handler
+from sonusai import initial_log_messages
+from sonusai import logger
+from sonusai import update_console_handler
+from sonusai.mixture import AudioT
+from sonusai.mixture import CHANNEL_COUNT
+from sonusai.mixture import SAMPLE_RATE
+from sonusai.mixture import get_audio_from_feature
+from sonusai.mixture import get_feature_from_audio
+from sonusai.mixture import read_audio
+from sonusai.utils import calc_asr
+from sonusai.utils import create_timestamp
+from sonusai.utils import get_input_device_index_by_name
+from sonusai.utils import get_input_devices
+from sonusai.utils import import_keras_model
+from sonusai.utils import trim_docstring
+from sonusai.utils import write_wav
+
+
+def main() -> None:
+    args = docopt(trim_docstring(__doc__), version=sonusai.__version__, options_first=True)
+    ts = create_timestamp()
+
+    verbose = args['--verbose']
+    length = float(args['--length'])
+    input_name = args['--input']
+    model_name = args['--model']
+    ckpt_name = args['--checkpoint']
+    asr_name = args['--asr']
+    whisper_name = args['--whisper']
+    debug = args['--debug']
+    show = args['--show']
+
+    capture_name = f'audiofe_capture_{ts}.wav'
+    predict_name = f'audiofe_predict_{ts}.wav'
+    h5_name = f'audiofe_{ts}.h5'
+
+    if model_name is not None and ckpt_name is None:
+        print(printable_usage(trim_docstring(__doc__)))
+        exit(1)
+
+    # Setup logging file
+    create_file_handler('audiofe.log')
+    update_console_handler(verbose)
+    initial_log_messages('audiofe')
+
+    if show:
+        logger.info('List of available audio inputs:')
+        logger.info('')
+        p = pyaudio.PyAudio()
+        for name in get_input_devices(p):
+            logger.info(f'{name}')
+        logger.info('')
+        p.terminate()
+        return
+
+    if input_name is not None and exists(input_name):
+        capture_audio = get_frames_from_file(input_name, length)
+    else:
+        try:
+            capture_audio = get_frames_from_device(input_name, length)
+        except ValueError as e:
+            logger.exception(e)
+            return
+
+    write_wav(capture_name, capture_audio, SAMPLE_RATE)
+    logger.info('')
+    logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {capture_name}')
+    if debug:
+        with h5py.File(h5_name, 'a') as f:
+            if 'capture_audio' in f:
+                del f['capture_audio']
+            f.create_dataset('capture_audio', data=capture_audio)
+        logger.info(f'Wrote capture audio with shape {capture_audio.shape} to {h5_name}')
+
+    if asr_name is not None:
+        capture_asr = calc_asr(capture_audio, engine=asr_name, whisper_model_name=whisper_name).text
+        logger.info(f'Capture audio ASR: {capture_asr}')
+
+    if model_name is not None:
+        model = load_model(model_name=model_name, ckpt_name=ckpt_name)
+
+        feature = get_feature_from_audio(audio=capture_audio, feature_mode=model.hparams.feature)
+        if debug:
+            with h5py.File(h5_name, 'a') as f:
+                if 'feature' in f:
+                    del f['feature']
+                f.create_dataset('feature', data=feature)
+            logger.info(f'Wrote feature with shape {feature.shape} to {h5_name}')
+
+        # if debug:
+        #     reconstruct_name = f'audiofe_reconstruct_{ts}.wav'
+        #     reconstruct_audio = get_audio_from_feature(feature=feature, feature_mode=model.hparams.feature)
+        #     samples = min(len(capture_audio), len(reconstruct_audio))
+        #     max_err = np.max(np.abs(capture_audio[:samples] - reconstruct_audio[:samples]))
+        #     logger.info(f'Maximum error between capture and reconstruct: {max_err}')
+        #     write_wav(reconstruct_name, reconstruct_audio, SAMPLE_RATE)
+        #     logger.info(f'Wrote reconstruct audio with shape {reconstruct_audio.shape} to {reconstruct_name}')
+        #     with h5py.File(h5_name, 'a') as f:
+        #         if 'reconstruct_audio' in f:
+        #             del f['reconstruct_audio']
+        #         f.create_dataset('reconstruct_audio', data=reconstruct_audio)
+        #     logger.info(f'Wrote reconstruct audio with shape {reconstruct_audio.shape} to {h5_name}')
+
+        with torch.no_grad():
+            predict = model(torch.tensor(feature))
+        if debug:
+            with h5py.File(h5_name, 'a') as f:
+                if 'predict' in f:
+                    del f['predict']
+                f.create_dataset('predict', data=predict)
+            logger.info(f'Wrote predict with shape {predict.shape} to {h5_name}')
+
+        predict_audio = get_audio_from_feature(feature=predict.numpy(), feature_mode=model.hparams.feature)
+        write_wav(predict_name, predict_audio, SAMPLE_RATE)
+        logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {predict_name}')
+        if debug:
+            with h5py.File(h5_name, 'a') as f:
+                if 'predict_audio' in f:
+                    del f['predict_audio']
+                f.create_dataset('predict_audio', data=predict_audio)
+            logger.info(f'Wrote predict audio with shape {predict_audio.shape} to {h5_name}')
+
+        if asr_name is not None:
+            predict_asr = calc_asr(predict_audio, engine=asr_name, whisper_model_name=whisper_name).text
+            logger.info(f'Predict audio ASR: {predict_asr}')
+
+
+def load_model(model_name: str, ckpt_name: str) -> Any:
+    batch_size = 1
+    timesteps = 0
+
+    # Load checkpoint first to get hparams if available
+    try:
+        checkpoint = torch.load(ckpt_name, map_location=lambda storage, loc: storage)
+    except Exception as e:
+        logger.exception(f'Error: could not load checkpoint from {ckpt_name}: {e}')
+        raise SystemExit(1)
+
+    # Import model definition file
+    logger.info(f'Importing {model_name}')
+    litemodule = import_keras_model(model_name)
+
+    if 'hyper_parameters' in checkpoint:
+        logger.info(f'Found checkpoint file with hyper-parameters')
+        hparams = checkpoint['hyper_parameters']
+        if hparams['batch_size'] != batch_size:
+            logger.info(
+                f'Overriding model default batch_size of {hparams["batch_size"]} with batch_size of {batch_size}')
+            hparams["batch_size"] = batch_size
+
+        if hparams['timesteps'] != 0 and timesteps == 0:
+            timesteps = hparams['timesteps']
+            logger.warning(f'Using model default timesteps of {timesteps}')
+
+        logger.info(f'Building model with {len(hparams)} total hparams')
+        try:
+            model = litemodule.MyHyperModel(**hparams)
+        except Exception as e:
+            logger.exception(f'Error: model build (MyHyperModel) in {model_name} failed: {e}')
+            raise SystemExit(1)
+    else:
+        logger.info(f'Found checkpoint file with no hyper-parameters')
+        logger.info(f'Building model with defaults')
+        try:
+            tmp = litemodule.MyHyperModel()
+        except Exception as e:
+            logger.exception(f'Error: model build (MyHyperModel) in {model_name} failed: {e}')
+            raise SystemExit(1)
+
+        if tmp.batch_size != batch_size:
+            logger.info(f'Overriding model default batch_size of {tmp.batch_size} with batch_size of {batch_size}')
+
+        if tmp.timesteps != 0 and timesteps == 0:
+            timesteps = tmp.timesteps
+            logger.warning(f'Using model default timesteps of {timesteps}')
+
+        model = litemodule.MyHyperModel(timesteps=timesteps, batch_size=batch_size)
+
+    logger.info(f'Loading weights from {ckpt_name}')
+    model.load_state_dict(checkpoint["state_dict"])
+    model.eval()
+    return model
+
+
+def get_frames_from_device(input_name: str | None, length: float, chunk: int = 1024) -> AudioT:
+    p = pyaudio.PyAudio()
+
+    input_devices = get_input_devices(p)
+    if not input_devices:
+        raise ValueError('No input audio devices found')
+
+    if input_name is None:
+        input_name = input_devices[0]
+
+    try:
+        device_index = get_input_device_index_by_name(p, input_name)
+    except ValueError:
+        msg = f'Could not find {input_name}\n'
+        msg += f'Available devices:\n'
+        for input_device in input_devices:
+            msg += f'  {input_device}\n'
+        raise ValueError(msg)
+
+    logger.info(f'Capturing from {p.get_device_info_by_index(device_index).get("name")}')
+    stream = p.open(format=pyaudio.paFloat32,
+                    channels=CHANNEL_COUNT,
+                    rate=SAMPLE_RATE,
+                    input=True,
+                    input_device_index=device_index)
+    stream.start_stream()
+
+    print()
+    print('+---------------------------------+')
+    print('| Press Enter to stop             |')
+    print('+---------------------------------+')
+    print()
+
+    elapsed = 0.0
+    seconds_per_chunk = float(chunk) / float(SAMPLE_RATE)
+    raw_frames = []
+    while elapsed < length or length == -1:
+        raw_frames.append(stream.read(num_frames=chunk, exception_on_overflow=False))
+        elapsed += seconds_per_chunk
+        if select([stdin, ], [], [], 0)[0]:
+            stdin.read(1)
+            length = elapsed
+
+    stream.stop_stream()
+    stream.close()
+    p.terminate()
+    frames = np.frombuffer(b''.join(raw_frames), dtype=np.float32)
+    return frames
+
+
+def get_frames_from_file(input_name: str, length: float) -> AudioT:
+    logger.info(f'Capturing from {input_name}')
+    frames = read_audio(input_name)
+    if length != -1:
+        num_frames = int(length * SAMPLE_RATE)
+        if len(frames) > num_frames:
+            frames = frames[:num_frames]
+    return frames
+
+
+if __name__ == '__main__':
+    main()
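Taken together, the new module is a capture, feature, predict, inverse-transform pipeline. Below is a minimal sketch of the same round trip for file input, using only the public API imported above; the file names, the feature mode string 'hum00ns1', and the already-loaded model object are hypothetical stand-ins, not values from this diff.

import torch

from sonusai.mixture import SAMPLE_RATE
from sonusai.mixture import get_audio_from_feature
from sonusai.mixture import get_feature_from_audio
from sonusai.mixture import read_audio
from sonusai.utils import write_wav

audio = read_audio('capture.wav')  # hypothetical input file
feature = get_feature_from_audio(audio=audio, feature_mode='hum00ns1')  # hypothetical feature mode
with torch.no_grad():
    predict = model(torch.tensor(feature))  # model assumed loaded as in load_model() above
predict_audio = get_audio_from_feature(feature=predict.numpy(), feature_mode='hum00ns1')
write_wav('predict.wav', predict_audio, SAMPLE_RATE)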
sonusai/calc_metric_spenh.py
CHANGED
@@ -978,11 +978,11 @@ def _process_mixture(mixid: int) -> tuple[pd.DataFrame, pd.DataFrame]:
     plot_fname = base_name + '_metric_spenh.pdf'

     # Reshape feature to eliminate overlap redundancy for easier to understand spectrogram view
-    # Original size (frames, stride,
-    # Reshape to get frames*decimated_stride,
+    # Original size (frames, stride, feature_parameters), decimates in stride dimension only if step is > 1
+    # Reshape to get frames*decimated_stride, feature_parameters
     step = int(mixdb.feature_samples / mixdb.feature_step_samples)
     if feature.ndim != 3:
-        raise SonusAIError(f'feature does not have 3 dimensions: frames, stride,
+        raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, feature_parameters')

     # for feature cn*00n**
     feat_sgram = unstack_complex(feature)
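The reshape described in the comments above collapses overlapped feature frames into a flat spectrogram. The helper below is a hypothetical sketch of one plausible reading, not the actual calc_metric_spenh code: when step > 1, each feature frame of shape [stride, feature_parameters] contributes only stride/step new transform frames, so keeping the newest slices of each frame removes the redundancy.

import numpy as np

def decimate_overlap(feature: np.ndarray, step: int) -> np.ndarray:
    # feature: [frames, stride, feature_parameters]
    frames, stride, feature_parameters = feature.shape
    decimated_stride = stride // step              # new (non-overlapping) slices per frame
    trimmed = feature[:, -decimated_stride:, :]    # keep only the newest slices
    return trimmed.reshape(frames * decimated_stride, feature_parameters)

sgram = decimate_overlap(np.zeros((10, 6, 161), dtype=np.float32), step=3)
print(sgram.shape)  # (20, 161); with step == 1 nothing is decimated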
sonusai/data_generator/dataset_from_mixdb.py
CHANGED
@@ -42,7 +42,7 @@ class DatasetFromMixtureDatabase(Sequence):
         self.add1ch = add1ch
         self.shuffle = shuffle
         self.stride = self.mixdb.fg_stride
-        self.
+        self.feature_parameters = self.mixdb.feature_parameters
         self.num_classes = self.mixdb.num_classes
         self.mixture_frame_segments = None
         self.batch_frame_segments = None
sonusai/data_generator/keras_from_mixdb.py
CHANGED
@@ -61,7 +61,7 @@ class KerasFromMixtureDatabase(Sequence):
         self.add1ch = add1ch
         self.shuffle = shuffle
         self.stride = self.mixdb.fg_stride
-        self.
+        self.feature_parameters = self.mixdb.feature_parameters
         self.num_classes = self.mixdb.num_classes
         self.mixture_frame_segments: Optional[int] = None
         self.batch_frame_segments: Optional[int] = None
sonusai/genft.py
CHANGED
@@ -165,7 +165,8 @@ def main() -> None:
     logger.info(f'Wrote {len(mixids)} mixtures to {location}')
     logger.info('')
     logger.info(f'Duration: {seconds_to_hms(seconds=duration)}')
-    logger.info(
+    logger.info(
+        f'feature: {human_readable_size(total_feature_frames * mixdb.fg_stride * mixdb.feature_parameters * 4, 1)}')
     logger.info(f'truth_f: {human_readable_size(total_feature_frames * mixdb.num_classes * 4, 1)}')
     if compute_segsnr:
         logger.info(f'segsnr: {human_readable_size(total_transform_frames * 4, 1)}')
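The sizes logged above are float32 byte counts: feature frames x stride x feature_parameters x 4 bytes for the feature, and frames x num_classes x 4 bytes for truth_f. A quick sanity check of that arithmetic with made-up numbers:

total_feature_frames = 100_000   # hypothetical
fg_stride = 6                    # hypothetical
feature_parameters = 161         # hypothetical
num_classes = 3                  # hypothetical

feature_bytes = total_feature_frames * fg_stride * feature_parameters * 4
truth_f_bytes = total_feature_frames * num_classes * 4
print(feature_bytes, truth_f_bytes)  # 386400000 1200000, i.e. ~386.4 MB and 1.2 MB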
sonusai/genmixdb.py
CHANGED
@@ -337,12 +337,12 @@ def genmixdb(location: str,
     log_duration_and_sizes(total_duration=total_duration,
                            num_classes=mixdb.num_classes,
                            feature_step_samples=mixdb.feature_step_samples,
-
+                           feature_parameters=mixdb.feature_parameters,
                            stride=mixdb.fg_stride,
                            desc='Estimated')
     logger.info(f'Feature shape: '
-                f'{mixdb.fg_stride} x {mixdb.
-                f'({mixdb.fg_stride * mixdb.
+                f'{mixdb.fg_stride} x {mixdb.feature_parameters} '
+                f'({mixdb.fg_stride * mixdb.feature_parameters} total params)')
     logger.info(f'Feature samples: {mixdb.feature_samples} samples ({mixdb.feature_ms} ms)')
     logger.info(f'Feature step samples: {mixdb.feature_step_samples} samples ({mixdb.feature_step_ms} ms)')
     logger.info('')
@@ -371,7 +371,7 @@ def genmixdb(location: str,
     log_duration_and_sizes(total_duration=total_duration,
                            num_classes=mixdb.num_classes,
                            feature_step_samples=mixdb.feature_step_samples,
-
+                           feature_parameters=mixdb.feature_parameters,
                            stride=mixdb.fg_stride,
                            desc='Actual')
     logger.info('')
sonusai/keras_predict.py
CHANGED
@@ -180,7 +180,7 @@ def main() -> None:
     for file in input_name:
         # Convert WAV to feature data
         audio = read_audio(file)
-        feature = get_feature_from_audio(audio=audio,
+        feature = get_feature_from_audio(audio=audio, feature_mode=hypermodel.feature)

         feature, predict = _pad_and_predict(hypermodel=hypermodel,
                                             built_model=built_model,
sonusai/lsdb.py
CHANGED
@@ -48,8 +48,8 @@ def lsdb(mixdb: MixtureDatabase,
     logger.info(f'{"Targets":{desc_len}} {mixdb.num_target_files}')
     logger.info(f'{"Noises":{desc_len}} {mixdb.num_noise_files}')
     logger.info(f'{"Feature":{desc_len}} {mixdb.feature}')
-    logger.info(f'{"Feature shape":{desc_len}} {mixdb.fg_stride} x {mixdb.
-                f'({mixdb.fg_stride * mixdb.
+    logger.info(f'{"Feature shape":{desc_len}} {mixdb.fg_stride} x {mixdb.feature_parameters} '
+                f'({mixdb.fg_stride * mixdb.feature_parameters} total params)')
     logger.info(f'{"Feature samples":{desc_len}} {mixdb.feature_samples} samples ({mixdb.feature_ms} ms)')
     logger.info(f'{"Feature step samples":{desc_len}} {mixdb.feature_step_samples} samples '
                 f'({mixdb.feature_step_ms} ms)')
sonusai/main.py
CHANGED
@@ -3,9 +3,9 @@
 usage: sonusai [--version] [--help] <command> [<args>...]

 The sonusai commands are:
+   audiofe              Audio front end
    calc_metric_spenh    Run speech enhancement and analysis
    doc                  Documentation
-   evaluate             Evaluate model performance
    genft                Generate feature and truth data
    genmix               Generate mixture and truth data
    genmixdb             Generate a mixture database
@@ -39,9 +39,9 @@ def main() -> None:
     from sonusai.utils import trim_docstring

     commands = (
+        'audiofe',
         'calc_metric_spenh',
         'doc',
-        'evaluate',
         'genft',
         'genmix',
         'genmixdb',
sonusai/mixture/__init__.py
CHANGED
@@ -1,6 +1,4 @@
 # SonusAI mixture utilities
-from .audio import calculate_audio_from_transform
-from .audio import calculate_transform_from_audio
 from .audio import get_duration
 from .audio import get_next_noise
 from .audio import get_num_samples
@@ -83,6 +81,7 @@ from .datatypes import TruthFunctionConfig
 from .datatypes import TruthSetting
 from .datatypes import TruthSettings
 from .datatypes import UniversalSNR
+from .feature import get_audio_from_feature
 from .feature import get_feature_from_audio
 from .generation import generate_mixtures
 from .generation import get_all_snrs_from_config
@@ -102,8 +101,10 @@ from .helpers import augmented_noise_samples
 from .helpers import augmented_target_samples
 from .helpers import check_audio_files_exist
 from .helpers import forward_transform
+from .helpers import get_audio_from_transform
 from .helpers import get_ft
 from .helpers import get_segsnr
+from .helpers import get_transform_from_audio
 from .helpers import get_truth_t
 from .helpers import inverse_transform
 from .helpers import mixture_metadata
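Net effect of this hunk, together with the audio.py hunk below: the transform round-trip helpers moved out of sonusai.mixture.audio and are now re-exported from sonusai.mixture.helpers under new names. Downstream imports would presumably migrate as follows; only the names are asserted here, since the new signatures are not shown in this diff.

# 0.15.6 (removed in this release):
# from sonusai.mixture import calculate_transform_from_audio
# from sonusai.mixture import calculate_audio_from_transform

# 0.15.9:
from sonusai.mixture import get_transform_from_audio
from sonusai.mixture import get_audio_from_transform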
sonusai/mixture/audio.py
CHANGED
@@ -1,11 +1,6 @@
 from functools import lru_cache

-from pyaaware import ForwardTransform
-from pyaaware import InverseTransform
-
-from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
-from sonusai.mixture.datatypes import EnergyT
 from sonusai.mixture.datatypes import ImpulseResponseData


@@ -22,35 +17,6 @@ def get_next_noise(audio: AudioT, offset: int, length: int) -> AudioT:
     return np.take(audio, range(offset, offset + length), mode='wrap')


-def calculate_transform_from_audio(audio: AudioT,
-                                   transform: ForwardTransform) -> tuple[AudioF, EnergyT]:
-    """Apply forward transform to input audio data to generate transform data
-
-    :param audio: Time domain data [samples]
-    :param transform: ForwardTransform object
-    :return: Frequency domain data [frames, bins], Energy [frames]
-    """
-    f, e = transform.execute_all(audio)
-    return f.transpose(), e
-
-
-def calculate_audio_from_transform(data: AudioF,
-                                   transform: InverseTransform,
-                                   trim: bool = True) -> tuple[AudioT, EnergyT]:
-    """Apply inverse transform to input transform data to generate audio data
-
-    :param data: Frequency domain data [frames, bins]
-    :param transform: InverseTransform object
-    :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the transform
-    :return: Time domain data [samples], Energy [frames]
-    """
-    t, e = transform.execute_all(data.transpose())
-    if trim:
-        t = t[transform.N - transform.R:]
-
-    return t, e
-
-
 def get_duration(audio: AudioT) -> float:
     """Get duration of audio in seconds

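The trim logic in the removed calculate_audio_from_transform encodes a detail worth keeping in mind: an overlap-add inverse transform with window length N and hop R produces N - R samples of leading latency, so dropping t[:transform.N - transform.R] time-aligns the output with the transform input. With hypothetical values:

N, R = 256, 64   # hypothetical window length and hop
latency = N - R
print(latency)   # 192 samples dropped when trim=True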
sonusai/mixture/datatypes.py
CHANGED
sonusai/mixture/feature.py
CHANGED
@@ -1,51 +1,105 @@
+from typing import Optional
+
+from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import Feature


-def get_feature_from_audio(audio: AudioT,
-
+def get_feature_from_audio(audio: AudioT,
+                           feature_mode: str,
+                           num_classes: Optional[int] = 1,
+                           truth_mutex: Optional[bool] = False) -> Feature:
+    """Apply forward transform and generate feature data from audio data

+    :param audio: Time domain audio data [samples]
+    :param feature_mode: Feature mode
+    :param num_classes: Number of classes
+    :param truth_mutex: Whether to calculate 'other' label
+    :return: Feature data [frames, strides, feature_parameters]
+    """
     import numpy as np
     from pyaaware import FeatureGenerator

     from .augmentation import pad_audio_to_frame
-    from .datatypes import FeatureGeneratorConfig
     from .datatypes import TransformConfig
     from .helpers import forward_transform
-    from .truth import truth_reduction

-
-
-
+    fg = FeatureGenerator(feature_mode=feature_mode,
+                          num_classes=num_classes,
+                          truth_mutex=truth_mutex)

-    fg_config = FeatureGeneratorConfig(feature_mode=feature,
-                                       num_classes=num_classes,
-                                       truth_mutex=truth_mutex)
-    fg = FeatureGenerator(**asdict(fg_config))
     feature_step_samples = fg.ftransform_R * fg.decimation * fg.step
-
     audio = pad_audio_to_frame(audio, feature_step_samples)
-
-    audio_f = forward_transform(audio
+
+    audio_f = forward_transform(audio=audio,
+                                config=TransformConfig(N=fg.ftransform_N,
                                                        R=fg.ftransform_R,
                                                        bin_start=fg.bin_start,
                                                        bin_end=fg.bin_end,
                                                        ttype=fg.ftransform_ttype))

+    samples = len(audio)
     transform_frames = samples // fg.ftransform_R
     feature_frames = samples // feature_step_samples

-
-
-    data = np.empty((feature_frames, fg.stride, fg.num_bands), dtype=np.float32)
+    feature = np.empty((feature_frames, fg.stride, fg.feature_parameters), dtype=np.float32)

     feature_frame = 0
     for transform_frame in range(transform_frames):
-
-        fg.execute(audio_f[transform_frame], truth_reduction(truth_t[indices], truth_reduction_function))
+        fg.execute(audio_f[transform_frame])

         if fg.eof():
-
+            feature[feature_frame] = fg.feature()
             feature_frame += 1

-    return
+    return feature
+
+
+def get_audio_from_feature(feature: Feature,
+                           feature_mode: str,
+                           num_classes: Optional[int] = 1,
+                           truth_mutex: Optional[bool] = False,
+                           trim: Optional[bool] = True) -> AudioT:
+    """Apply inverse transform to feature data to generate audio data
+
+    :param feature: Feature data [frames, strides, feature_parameters]
+    :param feature_mode: Feature mode
+    :param num_classes: Number of classes
+    :param truth_mutex: Whether to calculate 'other' label
+    :param trim: Whether to trim the audio data
+    :return: Audio data [samples]
+    """
+    import numpy as np
+
+    from pyaaware import FeatureGenerator
+
+    from .datatypes import TransformConfig
+    from .helpers import inverse_transform
+    from sonusai.utils.stacked_complex import unstack_complex
+
+    fg = FeatureGenerator(feature_mode=feature_mode,
+                          num_classes=num_classes,
+                          truth_mutex=truth_mutex)
+
+    feature_complex = unstack_complex(feature)
+    if feature_mode[0:1] == 'h':
+        feature_complex = _power_uncompress(feature_complex)
+    return np.squeeze(inverse_transform(transform=feature_complex,
+                                        config=TransformConfig(N=fg.itransform_N,
+                                                               R=fg.itransform_R,
+                                                               bin_start=fg.bin_start,
+                                                               bin_end=fg.bin_end,
+                                                               ttype=fg.itransform_ttype),
+                                        trim=trim))
+
+
+def _power_uncompress(feature: AudioF) -> AudioF:
+    import numpy as np
+
+    mag = np.abs(feature)
+    phase = np.angle(feature)
+    mag = mag ** (1. / 0.3)
+    real_uncompress = mag * np.cos(phase)
+    imag_uncompress = mag * np.sin(phase)
+
+    return real_uncompress + 1j * imag_uncompress
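For 'h' feature modes, get_audio_from_feature undoes a power-law magnitude compression before inverting the transform. The forward companion (magnitude raised to 0.3) is inferred from _power_uncompress rather than shown in this hunk; under that assumption, a round-trip sanity check:

import numpy as np

def power_compress(x: np.ndarray) -> np.ndarray:
    return (np.abs(x) ** 0.3) * np.exp(1j * np.angle(x))         # assumed forward companion

def power_uncompress(x: np.ndarray) -> np.ndarray:
    return (np.abs(x) ** (1. / 0.3)) * np.exp(1j * np.angle(x))  # mirrors _power_uncompress above

x = np.array([3.0 + 4.0j, -1.0 + 0.5j])
print(np.allclose(power_uncompress(power_compress(x)), x))  # True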
sonusai/mixture/generation.py
CHANGED
@@ -433,7 +433,7 @@ def _initialize_targets_audio(mixdb: MixtureDatabase, mixture: Mixture) -> tuple
         # target_gain is used to back out the gain augmentation in order to return the target audio
         # to its normalized level when calculating truth (if needed).
         if target.augmentation.gain is not None:
-            target.gain = round(10 ** (target.augmentation.gain / 20), ndigits=
+            target.gain = round(10 ** (target.augmentation.gain / 20), ndigits=5)
         else:
             target.gain = 1

@@ -507,8 +507,8 @@ def _initialize_mixture_gains(mixdb: MixtureDatabase,
     mixture.target_snr_gain *= gain_adjustment
     mixture.noise_snr_gain *= gain_adjustment

-    mixture.target_snr_gain = round(mixture.target_snr_gain, ndigits=
-    mixture.noise_snr_gain = round(mixture.noise_snr_gain, ndigits=
+    mixture.target_snr_gain = round(mixture.target_snr_gain, ndigits=5)
+    mixture.noise_snr_gain = round(mixture.noise_snr_gain, ndigits=5)
     return mixture


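The rounding changed above converts a dB gain augmentation into a linear amplitude factor, now kept to five decimal places. For example, a +6 dB gain:

gain_db = 6
linear = round(10 ** (gain_db / 20), ndigits=5)
print(linear)  # 1.99526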