sonusai 0.19.9__py3-none-any.whl → 0.20.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/calc_metric_spenh.py +265 -233
- sonusai/data/genmixdb.yml +4 -2
- sonusai/data/silero_vad_v5.1.jit +0 -0
- sonusai/data/silero_vad_v5.1.onnx +0 -0
- sonusai/doc/doc.py +14 -0
- sonusai/genft.py +1 -1
- sonusai/genmetrics.py +15 -18
- sonusai/genmix.py +1 -1
- sonusai/genmixdb.py +30 -52
- sonusai/ir_metric.py +555 -0
- sonusai/metrics_summary.py +322 -0
- sonusai/mixture/__init__.py +6 -2
- sonusai/mixture/audio.py +139 -15
- sonusai/mixture/augmentation.py +199 -84
- sonusai/mixture/config.py +9 -4
- sonusai/mixture/constants.py +0 -1
- sonusai/mixture/datatypes.py +19 -10
- sonusai/mixture/generation.py +52 -64
- sonusai/mixture/helpers.py +38 -26
- sonusai/mixture/ir_delay.py +63 -0
- sonusai/mixture/mixdb.py +190 -46
- sonusai/mixture/targets.py +3 -6
- sonusai/mixture/truth_functions/energy.py +9 -5
- sonusai/mixture/truth_functions/metrics.py +1 -1
- sonusai/mkwav.py +1 -1
- sonusai/onnx_predict.py +1 -1
- sonusai/queries/queries.py +1 -1
- sonusai/utils/__init__.py +2 -0
- sonusai/utils/asr.py +1 -1
- sonusai/utils/load_object.py +8 -2
- sonusai/utils/stratified_shuffle_split.py +1 -1
- sonusai/utils/temp_seed.py +13 -0
- {sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/METADATA +2 -2
- {sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/RECORD +36 -35
- {sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/WHEEL +1 -1
- sonusai/mixture/soundfile_audio.py +0 -130
- sonusai/mixture/sox_audio.py +0 -476
- sonusai/mixture/sox_augmentation.py +0 -136
- sonusai/mixture/torchaudio_audio.py +0 -106
- sonusai/mixture/torchaudio_augmentation.py +0 -109
- {sonusai-0.19.9.dist-info → sonusai-0.20.2.dist-info}/entry_points.txt +0 -0
@@ -1,106 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
|
-
from sonusai.mixture.datatypes import AudioT
|
4
|
-
from sonusai.mixture.datatypes import ImpulseResponseData
|
5
|
-
|
6
|
-
|
7
|
-
def read_impulse_response(
    name: str | Path,
    delay_compensation: bool = True,
    normalize: bool = True,
) -> ImpulseResponseData:
    """Read impulse response data using torchaudio

    Only index 0 along the first axis of the loaded tensor is used (the first
    channel, given torchaudio's default channels-first layout).

    :param name: File name; it is passed through tokenized_expand before reading
    :param delay_compensation: Drop every sample that precedes the largest sample
    :param normalize: Scale the data by the inverse of its L2 norm
    :return: ImpulseResponseData object
    :raises OSError: If the file cannot be loaded
    """
    import numpy as np
    import torch
    import torchaudio

    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    # Read impulse response data from audio file
    try:
        raw, sample_rate = torchaudio.load(expanded_name, backend="soundfile")
    except Exception as e:
        if name != expanded_name:
            raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
        else:
            raise OSError(f"Error reading {name}: {e}") from e

    raw = torch.squeeze(raw[0, :])

    if delay_compensation:
        # Treat the position of the largest sample as the start of the response
        offset = torch.argmax(raw)
        raw = raw[offset:]

    data = np.array(raw).astype(np.float32)

    if normalize:
        # Inexplicably,
        #   data = data / torch.linalg.vector_norm(data)
        # causes multiprocessing contexts to hang.
        # Use np.linalg.norm() instead.
        norm = np.linalg.norm(data)
        # An all-zero response has a zero norm; dividing by it would turn the
        # whole array into NaN, so leave the data untouched in that case.
        if norm > 0:
            data = data / norm

    return ImpulseResponseData(name=str(name), sample_rate=sample_rate, data=data)
|
52
|
-
|
53
|
-
|
54
|
-
def get_sample_rate(name: str | Path) -> int:
    """Get sample rate from audio file using torchaudio

    :param name: File name; it is passed through tokenized_expand before use
    :return: Sample rate reported by torchaudio.info
    :raises OSError: If the file metadata cannot be read
    """
    import torchaudio

    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        metadata = torchaudio.info(expanded_name)
        return metadata.sample_rate
    except Exception as err:
        # Mention the expanded path only when expansion actually changed the name
        if name == expanded_name:
            raise OSError(f"Error reading {name}:\n{err}") from err
        raise OSError(f"Error reading {name} (expanded: {expanded_name}):\n{err}") from err
|
73
|
-
|
74
|
-
|
75
|
-
def read_audio(name: str | Path) -> AudioT:
    """Read audio data from a file using torchaudio

    The entry at index 0 along the first axis of the loaded tensor is kept and
    resampled to SAMPLE_RATE.

    :param name: File name; it is passed through tokenized_expand before reading
    :return: Array of time domain audio data
    :raises OSError: If loading or resampling fails
    """
    import numpy as np
    import torch
    import torchaudio

    from .constants import SAMPLE_RATE
    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        loaded, source_rate = torchaudio.load(expanded_name, backend="soundfile")
        # Keep a 2-D (1, samples) shape for the resampler
        first_row = torch.reshape(loaded[0, :], (1, loaded.size()[1]))
        resampled = torchaudio.functional.resample(
            first_row,
            orig_freq=source_rate,
            new_freq=SAMPLE_RATE,
            resampling_method="sinc_interp_hann",
        )
    except Exception as err:
        # Mention the expanded path only when expansion actually changed the name
        if name == expanded_name:
            raise OSError(f"Error reading {name}:\n{err}") from err
        raise OSError(f"Error reading {name} (expanded: {expanded_name}):\n{err}") from err

    return np.squeeze(np.array(resampled))
|
@@ -1,109 +0,0 @@
|
|
1
|
-
from sonusai.mixture.datatypes import AudioT
|
2
|
-
from sonusai.mixture.datatypes import Augmentation
|
3
|
-
from sonusai.mixture.datatypes import ImpulseResponseData
|
4
|
-
|
5
|
-
|
6
|
-
def apply_augmentation(audio: AudioT, augmentation: Augmentation, frame_length: int = 1) -> AudioT:
    """Apply augmentations to audio data using torchaudio.sox_effects

    Each augmentation field that is not None contributes one sox effect, in the
    fixed order: norm, gain, pitch (followed by a rate conversion to
    SAMPLE_RATE), tempo, eq1, eq2, eq3, lowpass.

    :param audio: Audio
    :param augmentation: Augmentation
    :param frame_length: Pad resulting audio to be a multiple of this
    :return: Augmented audio
    :raises RuntimeError: If torchaudio fails to apply the effect chain
    """
    import numpy as np
    import torch
    import torchaudio

    from .augmentation import pad_audio_to_frame
    from .constants import SAMPLE_RATE

    sox_chain: list[list[str]] = []

    # TODO: Always normalize and remove normalize from list of available augmentations
    # Normalize to globally set level (should this be a global config parameter, or hard-coded into the script?)
    # TODO: Support all sox effects supported by torchaudio (torchaudio.sox_effects.effect_names())
    if augmentation.normalize is not None:
        sox_chain.append(["norm", str(augmentation.normalize)])

    if augmentation.gain is not None:
        sox_chain.append(["gain", str(augmentation.gain)])

    if augmentation.pitch is not None:
        # The pitch effect is always paired with a rate conversion to SAMPLE_RATE
        sox_chain.extend([["pitch", str(augmentation.pitch)], ["rate", str(SAMPLE_RATE)]])

    if augmentation.tempo is not None:
        sox_chain.append(["tempo", "-s", str(augmentation.tempo)])

    # The three equalizer slots are handled identically, in order
    for band in (augmentation.eq1, augmentation.eq2, augmentation.eq3):
        if band is not None:
            sox_chain.append(["equalizer", *map(str, band)])

    if augmentation.lpf is not None:
        sox_chain.append(["lowpass", "-2", str(augmentation.lpf), "0.707"])

    # Nothing requested: only make sure length is multiple of frame_length
    if not sox_chain:
        return pad_audio_to_frame(audio=audio, frame_length=frame_length)

    # Present 1-D audio to torchaudio as a (1, samples) tensor
    if audio.ndim == 1:
        audio = np.reshape(audio, (1, audio.shape[0]))
    tensor = torch.tensor(audio)

    try:
        tensor, _ = torchaudio.sox_effects.apply_effects_tensor(tensor, sample_rate=SAMPLE_RATE, effects=sox_chain)
    except Exception as e:
        raise RuntimeError(f"Error applying {augmentation}: {e}") from e

    processed = np.squeeze(np.array(tensor))

    # make sure length is multiple of frame_length
    return pad_audio_to_frame(audio=processed, frame_length=frame_length)
|
67
|
-
|
68
|
-
|
69
|
-
def apply_impulse_response(audio: AudioT, ir: ImpulseResponseData) -> AudioT:
    """Apply impulse response to audio data using torchaudio.fftconvolve

    The audio is converted to the IR sample rate, convolved with the IR,
    converted back to SAMPLE_RATE, passed through the sox "norm" effect with
    the peak level (in dB) measured on the input, and truncated to the input
    length.

    :param audio: Audio (handled here as a 1-D array: len(audio) is used as the sample count)
    :param ir: Impulse response data; None returns the audio unchanged
    :return: Augmented audio
    """
    import numpy as np
    import torch
    import torchaudio

    from sonusai.utils import linear_to_db

    from .constants import SAMPLE_RATE

    # Early exit if no ir or if all audio is zero
    if ir is None or not audio.any():
        return audio

    # Get current maximum level in dB.
    # Use NumPy reductions instead of the builtin max(), which would iterate
    # the array one element at a time in Python.
    max_db = linear_to_db(np.max(np.abs(audio)))

    # Convert audio to IR sample rate
    audio_in = torch.reshape(torch.tensor(audio), (1, len(audio)))
    audio_out, _ = torchaudio.sox_effects.apply_effects_tensor(
        audio_in, sample_rate=SAMPLE_RATE, effects=[["rate", str(ir.sample_rate)]]
    )

    # Apply IR and convert back to global sample rate
    rir = torch.reshape(torch.tensor(ir.data), (1, len(ir.data)))
    audio_out = torchaudio.functional.fftconvolve(audio_out, rir)
    audio_out, _ = torchaudio.sox_effects.apply_effects_tensor(
        audio_out, sample_rate=ir.sample_rate, effects=[["rate", str(SAMPLE_RATE)]]
    )

    # Reset level to previous max value
    audio_out, _ = torchaudio.sox_effects.apply_effects_tensor(
        audio_out, sample_rate=SAMPLE_RATE, effects=[["norm", str(max_db)]]
    )

    # Drop the convolution tail so the result has the same length as the input
    return np.squeeze(np.array(audio_out[:, : len(audio)]))
|
File without changes
|