PyPI - sonusai - Versions diffs - 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl - Mend

sonusai 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

sonusai/calc_metric_spenh.py +265 -233
sonusai/data/genmixdb.yml +4 -2
sonusai/data/silero_vad_v5.1.jit +0 -0
sonusai/data/silero_vad_v5.1.onnx +0 -0
sonusai/doc/doc.py +14 -0
sonusai/genft.py +1 -1
sonusai/genmetrics.py +15 -18
sonusai/genmix.py +1 -1
sonusai/genmixdb.py +30 -52
sonusai/ir_metric.py +555 -0
sonusai/metrics_summary.py +322 -0
sonusai/mixture/__init__.py +6 -2
sonusai/mixture/audio.py +139 -15
sonusai/mixture/augmentation.py +199 -84
sonusai/mixture/config.py +9 -4
sonusai/mixture/constants.py +0 -1
sonusai/mixture/datatypes.py +19 -10
sonusai/mixture/generation.py +52 -64
sonusai/mixture/helpers.py +38 -26
sonusai/mixture/ir_delay.py +63 -0
sonusai/mixture/mixdb.py +190 -46
sonusai/mixture/targets.py +3 -6
sonusai/mixture/truth_functions/energy.py +9 -5
sonusai/mixture/truth_functions/metrics.py +1 -1
sonusai/mkwav.py +1 -1
sonusai/onnx_predict.py +1 -1
sonusai/queries/queries.py +1 -1
sonusai/utils/__init__.py +2 -0
sonusai/utils/asr.py +1 -1
sonusai/utils/load_object.py +8 -2
sonusai/utils/stratified_shuffle_split.py +1 -1
sonusai/utils/temp_seed.py +13 -0
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/METADATA +2 -2
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/RECORD +36 -35
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/WHEEL +1 -1
sonusai/mixture/soundfile_audio.py +0 -130
sonusai/mixture/sox_audio.py +0 -476
sonusai/mixture/sox_augmentation.py +0 -136
sonusai/mixture/torchaudio_audio.py +0 -106
sonusai/mixture/torchaudio_augmentation.py +0 -109
{sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/entry_points.txt +0 -0

sonusai/mixture/sox_audio.py DELETED Viewed

@@ -1,476 +0,0 @@
-from pathlib import Path
-import numpy as np
-from sox import Transformer as SoxTransformer
-from sonusai.mixture.datatypes import AudioT
-from sonusai.mixture.datatypes import ImpulseResponseData
-def read_impulse_response(name: str | Path) -> ImpulseResponseData:
-    """Load impulse response data from a WAV file
-    The file is read with scipy.io.wavfile, converted to float32, cut so that it
-    starts at the index returned by np.argmax, and divided by its np.linalg.norm.
-    :param name: File name (passed through tokenized_expand before reading)
-    :return: ImpulseResponseData object
-    :raises OSError: If the file cannot be read
-    """
-    from scipy.io import wavfile
-    from .datatypes import ImpulseResponseData
-    from .tokenized_shell_vars import tokenized_expand
-    expanded_name, _ = tokenized_expand(name)
-    try:
-        sample_rate, raw = wavfile.read(expanded_name)
-    except Exception as e:
-        # Only mention the expanded path when expansion actually changed the name
-        if name == expanded_name:
-            raise OSError(f"Error reading {name}: {e}") from e
-        raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
-    samples = raw.astype(np.float32)
-    # Discard everything ahead of the largest sample, then normalize what is left
-    trimmed = samples[np.argmax(samples) :]
-    normalized = trimmed / np.linalg.norm(trimmed)
-    return ImpulseResponseData(name=str(name), sample_rate=sample_rate, data=normalized)
-def read_audio(name: str | Path) -> AudioT:
-    """Decode an audio file to raw samples by calling SoX directly
-    SoX is asked for headerless ("raw") output on stdout using SAMPLE_RATE, BIT_DEPTH,
-    CHANNEL_COUNT and ENCODING, with the "remix 1" effect appended. The returned bytes
-    are viewed as a NumPy array whose dtype follows BIT_DEPTH and ENCODING.
-    :param name: File name (passed through tokenized_expand before reading)
-    :return: Array of time domain audio data
-    :raises OSError: If SoX reports a failure or its output cannot be converted
-    """
-    from sox.core import sox
-    from .constants import BIT_DEPTH
-    from .constants import CHANNEL_COUNT
-    from .constants import ENCODING
-    from .constants import SAMPLE_RATE
-    from .tokenized_shell_vars import tokenized_expand
-    expanded_name, _ = tokenized_expand(name)
-    try:
-        # NOTE: pysox format transformations do not handle encoding properly; need to use direct call to sox instead
-        output_spec = [
-            "-t",
-            "raw",
-            "-r",
-            str(SAMPLE_RATE),
-            "-b",
-            str(BIT_DEPTH),
-            "-c",
-            str(CHANNEL_COUNT),
-            "-e",
-            ENCODING,
-        ]
-        status, out, err = sox(["-D", "-G", expanded_name, *output_spec, "-", "remix", "1"], None, False)
-        if status != 0:
-            raise RuntimeError(f"sox stdout: {out}\nsox stderr: {err}")  # noqa: TRY301
-        # Map the configured bit depth to the dtype used to view SoX's byte output;
-        # 32-bit data is float only for floating-point encoding, integer otherwise
-        sample_types = {
-            8: np.int8,
-            16: np.int16,
-            24: np.int32,
-            32: np.float32 if ENCODING == "floating-point" else np.int32,
-            64: np.float64,
-        }
-        if BIT_DEPTH not in sample_types:
-            raise ValueError(f"Invalid BIT_DEPTH {BIT_DEPTH}")
-        return np.frombuffer(out, dtype=sample_types[BIT_DEPTH])
-    except Exception as e:
-        # Every failure above (including the RuntimeError/ValueError) is reported as OSError
-        if name == expanded_name:
-            raise OSError(f"Error reading {name}:\n{e}") from e
-        raise OSError(f"Error reading {name} (expanded: {expanded_name}):\n{e}") from e
-class Transformer(SoxTransformer):
-    """SonusAI variant of sox.Transformer
-    Overrides fir, tempo, build and build_array; all other behavior is inherited
-    from sox.Transformer.
-    """
-    def fir(self, coefficients):
-        """Use SoX's FFT convolution engine with given FIR filter coefficients.
-        The SonusAI override allows coefficients to be either a list of numbers
-        or a string. A list is formatted value by value; a string is handed to
-        SoX unchanged (e.g. the name of a text file holding the coefficients).
-        Parameters
-        ----------
-        coefficients : list or str
-            fir filter coefficients
-        Returns
-        -------
-        self, so that calls can be chained.
-        Raises
-        ------
-        TypeError
-            If coefficients is neither a list nor a str, or if it is a list
-            containing a non-number.
-        """
-        from sox.core import is_number
-        if not isinstance(coefficients, list) and not isinstance(coefficients, str):
-            raise TypeError("coefficients must be a list or a str.")
-        if isinstance(coefficients, list) and not all(is_number(c) for c in coefficients):
-            raise TypeError("coefficients list must be numbers.")
-        effect_args = ["fir"]
-        if isinstance(coefficients, list):
-            # Fixed-point formatting of each coefficient as its own argument
-            effect_args.extend([f"{c:f}" for c in coefficients])
-        else:
-            effect_args.append(coefficients)
-        self.effects.extend(effect_args)
-        self.effects_log.append("fir")
-        return self
-    def tempo(self, factor, audio_type=None, quick=False):
-        """Time stretch audio without changing pitch.
-        This effect uses the WSOLA algorithm. The audio is chopped up into
-        segments which are then shifted in the time domain and overlapped
-        (cross-faded) at points where their waveforms are most similar as
-        determined by measurement of least squares.
-        The SonusAI override does not generate a warning for small factors.
-        The sox.Transformer's implementation of stretch does not invert
-        the factor even though it says that it does; this invalidates the
-        factor size check and produces the wrong result.
-        NOTE(review): the "extreme time stretching factor" warning below is
-        still emitted for factor < 0.5 or factor > 2; the warning that is
-        dropped is presumably the base class's hint to use stretch for factors
-        close to 1 -- confirm against pysox.
-        Parameters
-        ----------
-        factor : float
-            The ratio of new tempo to the old tempo.
-            For ex. 1.1 speeds up the tempo by 10%; 0.9 slows it down by 10%.
-        audio_type : str
-            Type of audio, which optimizes algorithm parameters. One of:
-             * m : Music,
-             * s : Speech,
-             * l : Linear (useful when factor is close to 1),
-        quick : bool, default=False
-            If True, this effect will run faster but with lower sound quality.
-        Returns
-        -------
-        self, so that calls can be chained.
-        Raises
-        ------
-        ValueError
-            If factor is not a positive number or audio_type is not one of
-            None, 'm', 's', 'l'.
-        TypeError
-            If quick is not a bool.
-        See Also
-        --------
-        stretch, speed, pitch
-        """
-        from sox.core import is_number
-        from sox.log import logger
-        if not is_number(factor) or factor <= 0:
-            raise ValueError("factor must be a positive number")
-        if factor < 0.5 or factor > 2:
-            logger.warning("Using an extreme time stretching factor. Quality of results will be poor")
-        if audio_type not in [None, "m", "s", "l"]:
-            raise ValueError("audio_type must be one of None, 'm', 's', or 'l'.")
-        if not isinstance(quick, bool):
-            raise TypeError("quick must be a boolean")
-        # Argument order passed to SoX: tempo [-q] [-m|-s|-l] factor
-        effect_args = ["tempo"]
-        if quick:
-            effect_args.append("-q")
-        if audio_type is not None:
-            effect_args.append(f"-{audio_type}")
-        effect_args.append(f"{factor:f}")
-        self.effects.extend(effect_args)
-        self.effects_log.append("tempo")
-        return self
-    def build(  # pyright: ignore [reportIncompatibleMethodOverride]
-        self,
-        input_filepath: str | Path | None = None,
-        output_filepath: str | Path | None = None,
-        input_array: np.ndarray | None = None,
-        sample_rate_in: float | None = None,
-        extra_args: list[str] | None = None,
-        return_output: bool = False,
-    ) -> tuple[bool, str | None, str | None]:
-        """Given an input file or array, creates an output_file on disk by
-        executing the current set of commands. This override always returns a
-        triple: (True, None, None) when return_output is False, otherwise
-        (status, out, err), giving the status along with stdout and stderr
-        returned by sox.
-        Parameters
-        ----------
-        input_filepath : str, Path or None
-            Either path to input audio file or None for array input.
-        output_filepath : str or Path
-            Path to desired output file. If a file already exists at
-            the given path, the file will be overwritten.
-            If '-n', no file is created.
-            Must be given and must differ from input_filepath.
-        input_array : np.ndarray or None
-            An np.ndarray of an waveform with shape (n_samples, n_channels).
-            sample_rate_in must also be provided.
-            If None, input_filepath must be specified.
-        sample_rate_in : int
-            Sample rate of input_array.
-            This argument is ignored if input_array is None.
-        extra_args : list or None, default=None
-            If a list is given, these additional arguments are passed to SoX
-            at the end of the list of effects.
-            Don't use this argument unless you know exactly what you're doing!
-        return_output : bool, default=False
-            If True, returns the status and information sent to stderr and
-            stdout as a tuple (status, stdout, stderr).
-            If False, returns (True, None, None) on success.
-        Returns
-        -------
-        status : bool
-            True when return_output is False; otherwise the status value
-            returned by sox (0, since any other value raises SoxError).
-        out : str or None
-            stdout produced by sox when return_output is True, else None.
-        err : str or None
-            stderr produced by sox when return_output is True, else None.
-        Raises
-        ------
-        ValueError
-            If output_filepath is None, if it equals input_filepath, or if
-            extra_args is not a list.
-        SoxError
-            If sox returns a non-zero status.
-        Examples
-        --------
-        > import numpy as np
-        > import sox
-        > tfm = sox.Transformer()
-        > sample_rate = 44100
-        > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)
-        file in, file out - basic usage
-        > status = tfm.build('path/to/input.wav', 'path/to/output.mp3')
-        file in, file out - equivalent usage
-        > status = tfm.build(
-                input_filepath='path/to/input.wav',
-                output_filepath='path/to/output.mp3'
-            )
-        array in, file out
-        > status = tfm.build(
-                input_array=y, sample_rate_in=sample_rate,
-                output_filepath='path/to/output.mp3'
-            )
-        """
-        from sox import file_info
-        from sox.core import SoxError
-        from sox.core import sox
-        from sox.log import logger
-        input_format, input_filepath = self._parse_inputs(input_filepath, input_array, sample_rate_in)
-        if output_filepath is None:
-            raise ValueError("output_filepath is not specified!")
-        # set output parameters
-        if input_filepath == output_filepath:
-            raise ValueError("input_filepath must be different from output_filepath.")
-        file_info.validate_output_file(output_filepath)
-        # SoX command line: globals, input format + input, output format + output, effects, extras
-        args = []
-        args.extend(self.globals)
-        args.extend(self._input_format_args(input_format))
-        args.append(input_filepath)
-        args.extend(self._output_format_args(self.output_format))
-        args.append(output_filepath)
-        args.extend(self.effects)
-        if extra_args is not None:
-            if not isinstance(extra_args, list):
-                raise ValueError("extra_args must be a list.")
-            args.extend(extra_args)
-        status, out, err = sox(args, input_array, True)
-        if status != 0:
-            raise SoxError(f"Stdout: {out}\nStderr: {err}")
-        logger.info("Created %s with effects: %s", output_filepath, " ".join(self.effects_log))
-        if return_output:
-            return status, out, err  # pyright: ignore [reportReturnType]
-        return True, None, None
-    def build_array(  # pyright: ignore [reportIncompatibleMethodOverride]
-        self,
-        input_filepath: str | Path | None = None,
-        input_array: np.ndarray | None = None,
-        sample_rate_in: int | None = None,
-        extra_args: list[str] | None = None,
-    ) -> np.ndarray:
-        """Given an input file or array, returns the output as a numpy array
-        by executing the current set of commands. By default, the array will
-        have the same sample rate as the input file unless otherwise specified
-        using set_output_format. Functions such as channels and convert
-        will be ignored!
-        The SonusAI override does not generate a warning for rate transforms.
-        The dtype of the returned array follows the output bit depth:
-        8 -> int8, 16 -> int16, 32 -> float32, 64 -> float64.
-        Parameters
-        ----------
-        input_filepath : str, Path or None
-            Either path to input audio file or None.
-        input_array : np.ndarray or None
-            A np.ndarray of a waveform with shape (n_samples, n_channels).
-            If this argument is passed, sample_rate_in must also be provided.
-            If None, input_filepath must be specified.
-        sample_rate_in : int
-            Sample rate of input_array.
-            This argument is ignored if input_array is None.
-        extra_args : list or None, default=None
-            If a list is given, these additional arguments are passed to SoX
-            at the end of the list of effects.
-            Don't use this argument unless you know exactly what you're doing!
-        Returns
-        -------
-        output_array : np.ndarray
-            Output audio as a numpy array; reshaped to
-            (n_samples, n_channels) when there is more than one output channel.
-        Raises
-        ------
-        ValueError
-            If the output bit depth is not 8, 16, 32 or 64, or if extra_args
-            is not a list.
-        SoxError
-            If sox returns a non-zero status.
-        Examples
-        --------
-        > import numpy as np
-        > import sox
-        > tfm = sox.Transformer()
-        > sample_rate = 44100
-        > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)
-        file in, array out
-        > output_array = tfm.build_array(input_filepath='path/to/input.wav')
-        array in, array out
-        > output_array = tfm.build_array(input_array=y, sample_rate_in=sample_rate)
-        specifying the output sample rate
-        > tfm.set_output_format(rate=8000)
-        > output_array = tfm.build_array(input_array=y, sample_rate_in=sample_rate)
-        if an effect changes the number of channels, you must explicitly
-        specify the number of output channels
-        > tfm.remix(remix_dictionary={1: [1], 2: [1], 3: [1]})
-        > tfm.set_output_format(channels=3)
-        > output_array = tfm.build_array(input_array=y, sample_rate_in=sample_rate)
-        """
-        from sox.core import SoxError
-        from sox.core import sox
-        from sox.log import logger
-        from sox.transform import ENCODINGS_MAPPING
-        input_format, input_filepath = self._parse_inputs(input_filepath, input_array, sample_rate_in)
-        # check if any of the below commands are part of the effects chain
-        ignored_commands = ["channels", "convert"]
-        if set(ignored_commands) & set(self.effects_log):
-            logger.warning(
-                "When outputting to an array, channels and convert "
-                + "effects may be ignored. Use set_output_format() to "
-                + "specify output formats."
-            )
-        # "-" as the output path: the result is taken from sox's captured output below
-        output_filepath = "-"
-        # Initial output dtype: int16 unless the input format names a file type
-        # that appears in ENCODINGS_MAPPING
-        if input_format.get("file_type") is None:
-            encoding_out = np.int16
-        else:
-            encoding_out = next(k for k, v in ENCODINGS_MAPPING.items() if input_format["file_type"] == v)
-        n_bits = np.dtype(encoding_out).itemsize * 8
-        output_format = {
-            "file_type": "raw",
-            "rate": sample_rate_in,
-            "bits": n_bits,
-            "channels": input_format["channels"],
-            "encoding": None,
-            "comments": None,
-            "append_comments": True,
-        }
-        # Values set through set_output_format() take precedence over the defaults above
-        if self.output_format.get("rate") is not None:
-            output_format["rate"] = self.output_format["rate"]
-        if self.output_format.get("channels") is not None:
-            output_format["channels"] = self.output_format["channels"]
-        if self.output_format.get("bits") is not None:
-            n_bits = self.output_format["bits"]
-            output_format["bits"] = n_bits
-        # Final dtype used to decode sox's raw bytes; note 32 bits is read as float32
-        match n_bits:
-            case 8:
-                encoding_out = np.int8  # type: ignore[assignment]
-            case 16:
-                encoding_out = np.int16
-            case 32:
-                encoding_out = np.float32  # type: ignore[assignment]
-            case 64:
-                encoding_out = np.float64  # type: ignore[assignment]
-            case _:
-                raise ValueError(f"invalid n_bits {n_bits}")
-        # SoX command line: globals, input format + input, output format + output, effects, extras
-        args = []
-        args.extend(self.globals)
-        args.extend(self._input_format_args(input_format))
-        args.append(input_filepath)
-        args.extend(self._output_format_args(output_format))
-        args.append(output_filepath)
-        args.extend(self.effects)
-        if extra_args is not None:
-            if not isinstance(extra_args, list):
-                raise ValueError("extra_args must be a list.")
-            args.extend(extra_args)
-        status, out, err = sox(args, input_array, False)
-        if status != 0:
-            raise SoxError(f"Stdout: {out}\nStderr: {err}")
-        out = np.frombuffer(out, dtype=encoding_out)  # pyright: ignore [reportArgumentType, reportCallIssue]
-        if output_format["channels"] > 1:
-            # Consecutive values cycle through the channels; regroup to (n_samples, n_channels)
-            out = out.reshape(
-                (output_format["channels"], int(len(out) / output_format["channels"])),
-                order="F",
-            ).T
-        logger.info("Created array with effects: %s", " ".join(self.effects_log))
-        return out

sonusai/mixture/sox_augmentation.py DELETED Viewed

@@ -1,136 +0,0 @@
-from sonusai.mixture.datatypes import AudioT
-from sonusai.mixture.datatypes import Augmentation
-from sonusai.mixture.datatypes import ImpulseResponseData
-def apply_augmentation(audio: AudioT, augmentation: Augmentation, frame_length: int = 1) -> AudioT:
-    """Run the requested augmentations over audio through a SoX effects chain
-    Effects are queued in a fixed order (normalize, gain, pitch, tempo, eq1, eq2, eq3, lpf),
-    each only when the matching Augmentation field is not None. If nothing is queued, SoX is
-    not invoked and the input audio is used unchanged. The result is then passed through
-    pad_audio_to_frame.
-    :param audio: Audio
-    :param augmentation: Augmentation
-    :param frame_length: Pad resulting audio to be a multiple of this
-    :return: Augmented audio
-    :raises RuntimeError: If setting up or running the effects chain fails
-    """
-    from .augmentation import pad_audio_to_frame
-    from .constants import BIT_DEPTH
-    from .constants import CHANNEL_COUNT
-    from .constants import ENCODING
-    from .constants import SAMPLE_RATE
-    from .sox_audio import Transformer
-    try:
-        # Input and output share the same global audio format
-        fmt = {"rate": SAMPLE_RATE, "bits": BIT_DEPTH, "channels": CHANNEL_COUNT, "encoding": ENCODING}
-        chain = Transformer()
-        chain.set_input_format(**fmt)
-        chain.set_output_format(**fmt)
-        queued = False
-        # TODO
-        #  Always normalize and remove normalize from list of available augmentations
-        #  Normalize to globally set level (should this be a global config parameter,
-        #  or hard-coded into the script?)
-        if augmentation.normalize is not None:
-            chain.norm(db_level=augmentation.normalize)
-            queued = True
-        if augmentation.gain is not None:
-            chain.gain(gain_db=augmentation.gain, normalize=False)
-            queued = True
-        if augmentation.pitch is not None:
-            # The pitch value is divided by 100 to obtain semitones; the pitch effect
-            # is followed by a rate effect at SAMPLE_RATE
-            chain.pitch(n_semitones=float(augmentation.pitch) / 100)
-            chain.rate(samplerate=SAMPLE_RATE)
-            queued = True
-        if augmentation.tempo is not None:
-            chain.tempo(factor=float(augmentation.tempo), audio_type="s")
-            queued = True
-        # The three equalizer slots are handled identically, in order
-        for slot in ("eq1", "eq2", "eq3"):
-            band = getattr(augmentation, slot)
-            if band is not None:
-                chain.equalizer(*band)
-                queued = True
-        if augmentation.lpf is not None:
-            chain.lowpass(frequency=augmentation.lpf)
-            queued = True
-        audio_out = chain.build_array(input_array=audio, sample_rate_in=SAMPLE_RATE) if queued else audio
-    except Exception as e:
-        raise RuntimeError(f"Error applying {augmentation}: {e}") from e
-    # make sure length is multiple of frame_length
-    return pad_audio_to_frame(audio=audio_out, frame_length=frame_length)
-def apply_impulse_response(audio: AudioT, ir: ImpulseResponseData) -> AudioT:
-    """Apply impulse response to audio data using SoX
-    The audio is converted to the IR sample rate, zero-padded on both sides by half the
-    IR length, filtered with SoX's fir effect (coefficients are passed through a temporary
-    text file), converted back to SAMPLE_RATE, normalized to the input's original peak
-    level and truncated to the input length.
-    :param audio: Audio
-    :param ir: Impulse response data
-    :return: Augmented audio; the input itself if ir is None or audio is all zeros
-    :raises RuntimeError: If SoX fails while applying the impulse response
-    """
-    import math
-    import tempfile
-    import numpy as np
-    from sonusai.utils import linear_to_db
-    from .constants import SAMPLE_RATE
-    from .sox_audio import Transformer
-    # Early exit if no ir or if all audio is zero
-    if ir is None or not audio.any():
-        return audio
-    # Get current maximum level in dB (vectorized instead of iterating the array in Python)
-    max_db = linear_to_db(np.max(np.abs(audio)))
-    # Convert audio to IR sample rate
-    tfm = Transformer()
-    tfm.set_output_format(rate=ir.sample_rate)
-    audio_out = tfm.build_array(input_array=audio, sample_rate_in=SAMPLE_RATE)
-    # Pad audio to align with original and give enough room for IR tail
-    pad = math.ceil(ir.length / 2)
-    audio_out = np.pad(array=audio_out, pad_width=(pad, pad))
-    # Write coefficients to a temporary file. The context manager closes (and thereby
-    # deletes) the file on every exit path; no separate unlink is needed, and unlinking
-    # after close would fail because the file is already gone.
-    with tempfile.NamedTemporaryFile(mode="w+t") as temp:
-        temp.writelines(f"{d:f}\n" for d in ir.data)
-        # Make sure SoX sees the complete file when it opens it by name
-        temp.flush()
-        # Apply IR and convert back to global sample rate
-        tfm = Transformer()
-        tfm.set_output_format(rate=SAMPLE_RATE)
-        tfm.fir(coefficients=temp.name)  # pyright: ignore [reportArgumentType]
-        try:
-            audio_out = tfm.build_array(input_array=audio_out, sample_rate_in=ir.sample_rate)
-        except Exception as e:
-            raise RuntimeError(f"Error applying IR: {e}") from e
-    # Reset level to previous max value
-    tfm = Transformer()
-    tfm.norm(db_level=max_db)
-    audio_out = tfm.build_array(input_array=audio_out, sample_rate_in=SAMPLE_RATE)
-    return audio_out[: len(audio)]

sonusai 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl

sonusai 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl