sonusai 0.18.8__py3-none-any.whl → 0.19.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +20 -29
- sonusai/aawscd_probwrite.py +18 -18
- sonusai/audiofe.py +93 -80
- sonusai/calc_metric_spenh.py +395 -321
- sonusai/data/genmixdb.yml +5 -11
- sonusai/{gentcst.py → deprecated/gentcst.py} +146 -149
- sonusai/{plot.py → deprecated/plot.py} +177 -131
- sonusai/{tplot.py → deprecated/tplot.py} +124 -102
- sonusai/doc/__init__.py +1 -1
- sonusai/doc/doc.py +112 -177
- sonusai/doc.py +10 -10
- sonusai/genft.py +93 -77
- sonusai/genmetrics.py +59 -46
- sonusai/genmix.py +116 -104
- sonusai/genmixdb.py +194 -153
- sonusai/lsdb.py +56 -66
- sonusai/main.py +23 -20
- sonusai/metrics/__init__.py +2 -0
- sonusai/metrics/calc_audio_stats.py +29 -24
- sonusai/metrics/calc_class_weights.py +7 -7
- sonusai/metrics/calc_optimal_thresholds.py +5 -7
- sonusai/metrics/calc_pcm.py +3 -3
- sonusai/metrics/calc_pesq.py +10 -7
- sonusai/metrics/calc_phase_distance.py +3 -3
- sonusai/metrics/calc_sa_sdr.py +10 -8
- sonusai/metrics/calc_segsnr_f.py +15 -17
- sonusai/metrics/calc_speech.py +105 -47
- sonusai/metrics/calc_wer.py +35 -32
- sonusai/metrics/calc_wsdr.py +10 -7
- sonusai/metrics/class_summary.py +30 -27
- sonusai/metrics/confusion_matrix_summary.py +25 -22
- sonusai/metrics/one_hot.py +91 -57
- sonusai/metrics/snr_summary.py +53 -46
- sonusai/mixture/__init__.py +19 -14
- sonusai/mixture/audio.py +4 -6
- sonusai/mixture/augmentation.py +37 -43
- sonusai/mixture/class_count.py +5 -14
- sonusai/mixture/config.py +292 -225
- sonusai/mixture/constants.py +41 -30
- sonusai/mixture/data_io.py +155 -0
- sonusai/mixture/datatypes.py +111 -108
- sonusai/mixture/db_datatypes.py +54 -70
- sonusai/mixture/eq_rule_is_valid.py +6 -9
- sonusai/mixture/feature.py +50 -46
- sonusai/mixture/generation.py +522 -389
- sonusai/mixture/helpers.py +217 -272
- sonusai/mixture/log_duration_and_sizes.py +16 -13
- sonusai/mixture/mixdb.py +677 -473
- sonusai/mixture/soundfile_audio.py +12 -17
- sonusai/mixture/sox_audio.py +91 -112
- sonusai/mixture/sox_augmentation.py +8 -9
- sonusai/mixture/spectral_mask.py +4 -6
- sonusai/mixture/target_class_balancing.py +41 -36
- sonusai/mixture/targets.py +69 -67
- sonusai/mixture/tokenized_shell_vars.py +23 -23
- sonusai/mixture/torchaudio_audio.py +14 -15
- sonusai/mixture/torchaudio_augmentation.py +23 -27
- sonusai/mixture/truth.py +48 -26
- sonusai/mixture/truth_functions/__init__.py +26 -0
- sonusai/mixture/truth_functions/crm.py +56 -38
- sonusai/mixture/truth_functions/datatypes.py +37 -0
- sonusai/mixture/truth_functions/energy.py +85 -59
- sonusai/mixture/truth_functions/file.py +30 -30
- sonusai/mixture/truth_functions/phoneme.py +14 -7
- sonusai/mixture/truth_functions/sed.py +71 -45
- sonusai/mixture/truth_functions/target.py +69 -106
- sonusai/mkwav.py +52 -85
- sonusai/onnx_predict.py +46 -43
- sonusai/queries/__init__.py +3 -1
- sonusai/queries/queries.py +100 -59
- sonusai/speech/__init__.py +2 -0
- sonusai/speech/l2arctic.py +24 -23
- sonusai/speech/librispeech.py +16 -17
- sonusai/speech/mcgill.py +22 -21
- sonusai/speech/textgrid.py +32 -25
- sonusai/speech/timit.py +45 -42
- sonusai/speech/vctk.py +14 -13
- sonusai/speech/voxceleb.py +26 -20
- sonusai/summarize_metric_spenh.py +11 -10
- sonusai/utils/__init__.py +4 -3
- sonusai/utils/asl_p56.py +1 -1
- sonusai/utils/asr.py +37 -17
- sonusai/utils/asr_functions/__init__.py +2 -0
- sonusai/utils/asr_functions/aaware_whisper.py +18 -12
- sonusai/utils/audio_devices.py +12 -12
- sonusai/utils/braced_glob.py +6 -8
- sonusai/utils/calculate_input_shape.py +1 -4
- sonusai/utils/compress.py +2 -2
- sonusai/utils/convert_string_to_number.py +1 -3
- sonusai/utils/create_timestamp.py +1 -1
- sonusai/utils/create_ts_name.py +2 -2
- sonusai/utils/dataclass_from_dict.py +1 -1
- sonusai/utils/docstring.py +6 -6
- sonusai/utils/energy_f.py +9 -7
- sonusai/utils/engineering_number.py +56 -54
- sonusai/utils/get_label_names.py +8 -10
- sonusai/utils/human_readable_size.py +2 -2
- sonusai/utils/model_utils.py +3 -5
- sonusai/utils/numeric_conversion.py +2 -4
- sonusai/utils/onnx_utils.py +43 -32
- sonusai/utils/parallel.py +40 -27
- sonusai/utils/print_mixture_details.py +25 -22
- sonusai/utils/ranges.py +12 -12
- sonusai/utils/read_predict_data.py +11 -9
- sonusai/utils/reshape.py +19 -26
- sonusai/utils/seconds_to_hms.py +1 -1
- sonusai/utils/stacked_complex.py +8 -16
- sonusai/utils/stratified_shuffle_split.py +29 -27
- sonusai/utils/write_audio.py +2 -2
- sonusai/utils/yes_or_no.py +3 -3
- sonusai/vars.py +14 -14
- {sonusai-0.18.8.dist-info → sonusai-0.19.5.dist-info}/METADATA +20 -21
- sonusai-0.19.5.dist-info/RECORD +125 -0
- {sonusai-0.18.8.dist-info → sonusai-0.19.5.dist-info}/WHEEL +1 -1
- sonusai/mixture/truth_functions/data.py +0 -58
- sonusai/utils/read_mixture_data.py +0 -14
- sonusai-0.18.8.dist-info/RECORD +0 -125
- {sonusai-0.18.8.dist-info → sonusai-0.19.5.dist-info}/entry_points.txt +0 -0
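Two mechanical changes recur throughout the hunks below: string literals are normalized from single to double quotes, and the package-wide `SonusAIError` is retired in favor of built-in exceptions (`OSError`, `ValueError`, `TypeError`, `RuntimeError`) raised with explicit `from e` chaining. A minimal sketch of the new error-handling style (illustrative names only, not code from the package):

```python
from pathlib import Path


def read_samples(name: str) -> bytes:
    """Hypothetical reader demonstrating the 0.19.x error-handling pattern."""
    try:
        return Path(name).read_bytes()
    except Exception as e:
        # "raise ... from e" preserves the original exception as __cause__,
        # so tracebacks show both the low-level failure and this OSError.
        raise OSError(f"Error reading {name}: {e}") from e
```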
```diff
@@ -9,29 +9,28 @@ def _raw_read(name: str | Path) -> tuple[AudioT, int]:
     import soundfile
     from pydub import AudioSegment
 
-    from sonusai import SonusAIError
     from .tokenized_shell_vars import tokenized_expand
 
     expanded_name, _ = tokenized_expand(name)
 
     try:
-        if expanded_name.endswith('.mp3'):
+        if expanded_name.endswith(".mp3"):
             sound = AudioSegment.from_mp3(expanded_name)
             raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
             raw = raw / 2 ** (sound.sample_width * 8 - 1)
             sample_rate = sound.frame_rate
-        elif expanded_name.endswith('.m4a'):
+        elif expanded_name.endswith(".m4a"):
             sound = AudioSegment.from_file(expanded_name)
             raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
             raw = raw / 2 ** (sound.sample_width * 8 - 1)
             sample_rate = sound.frame_rate
         else:
-            raw, sample_rate = soundfile.read(expanded_name, always_2d=True, dtype='float32')
+            raw, sample_rate = soundfile.read(expanded_name, always_2d=True, dtype="float32")
     except Exception as e:
         if name != expanded_name:
-            raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
+            raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
         else:
-            raise SonusAIError(f'Error reading {name}: {e}')
+            raise OSError(f"Error reading {name}: {e}") from e
 
     return np.squeeze(raw[:, 0]), sample_rate
 
```
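For reference, the MP3 branch of `_raw_read` above condenses to the following standalone sketch (the input path is hypothetical): pydub decodes to interleaved integers, which are scaled by the sample width to float32.

```python
import numpy as np
from pydub import AudioSegment

sound = AudioSegment.from_mp3("speech.mp3")  # hypothetical input file
raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
raw = raw / 2 ** (sound.sample_width * 8 - 1)  # scale full-scale ints into [-1, 1)
mono = np.squeeze(raw[:, 0])  # keep the first channel, as _raw_read does
```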
```diff
@@ -45,24 +44,23 @@ def get_sample_rate(name: str | Path) -> int:
     import soundfile
     from pydub import AudioSegment
 
-    from sonusai import SonusAIError
     from .tokenized_shell_vars import tokenized_expand
 
     expanded_name, _ = tokenized_expand(name)
 
     try:
-        if expanded_name.endswith('.mp3'):
+        if expanded_name.endswith(".mp3"):
             return AudioSegment.from_mp3(expanded_name).frame_rate
 
-        if expanded_name.endswith('.m4a'):
+        if expanded_name.endswith(".m4a"):
             return AudioSegment.from_file(expanded_name).frame_rate
 
         return soundfile.info(expanded_name).samplerate
     except Exception as e:
         if name != expanded_name:
-            raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
+            raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
         else:
-            raise SonusAIError(f'Error reading {name}: {e}')
+            raise OSError(f"Error reading {name}: {e}") from e
 
 
 def read_ir(name: str | Path) -> ImpulseResponseData:
```
```diff
@@ -95,10 +93,7 @@ def read_audio(name: str | Path) -> AudioT:
     from .constants import SAMPLE_RATE
 
     out, sample_rate = _raw_read(name)
-    out = librosa.resample(out,
-                           orig_sr=sample_rate,
-                           target_sr=SAMPLE_RATE,
-                           res_type='soxr_hq')
+    out = librosa.resample(out, orig_sr=sample_rate, target_sr=SAMPLE_RATE, res_type="soxr_hq")
 
     return out
 
```
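The collapsed call is plain librosa; a usage sketch, with 16000 standing in for sonusai's `SAMPLE_RATE` constant (an assumed value here):

```python
import librosa
import numpy as np

audio = np.zeros(44100, dtype=np.float32)  # one second at 44.1 kHz
# soxr_hq selects the high-quality SoX resampler backend
resampled = librosa.resample(audio, orig_sr=44100, target_sr=16000, res_type="soxr_hq")
```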
```diff
@@ -119,11 +114,11 @@ def get_num_samples(name: str | Path) -> int:
 
     expanded_name, _ = tokenized_expand(name)
 
-    if expanded_name.endswith('.mp3'):
+    if expanded_name.endswith(".mp3"):
        sound = AudioSegment.from_mp3(expanded_name)
        samples = sound.frame_count()
        sample_rate = sound.frame_rate
-    elif expanded_name.endswith('.m4a'):
+    elif expanded_name.endswith(".m4a"):
        sound = AudioSegment.from_file(expanded_name)
        samples = sound.frame_count()
        sample_rate = sound.frame_rate
```
sonusai/mixture/sox_audio.py
CHANGED
```diff
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Optional
 
 import numpy as np
 from sox import Transformer as SoxTransformer
```
```diff
@@ -16,7 +15,6 @@ def read_impulse_response(name: str | Path) -> ImpulseResponseData:
     """
     from scipy.io import wavfile
 
-    from sonusai import SonusAIError
     from .datatypes import ImpulseResponseData
     from .tokenized_shell_vars import tokenized_expand
 
```
```diff
@@ -27,9 +25,9 @@ def read_impulse_response(name: str | Path) -> ImpulseResponseData:
         sample_rate, data = wavfile.read(expanded_name)
     except Exception as e:
         if name != expanded_name:
-            raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}')
+            raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
         else:
-            raise SonusAIError(f'Error reading {name}: {e}')
+            raise OSError(f"Error reading {name}: {e}") from e
 
     data = data.astype(np.float32)
     offset = np.argmax(data)
```
```diff
@@ -49,7 +47,6 @@ def read_audio(name: str | Path) -> AudioT:
 
     from sox.core import sox
 
-    from sonusai import SonusAIError
     from .constants import BIT_DEPTH
     from .constants import CHANNEL_COUNT
     from .constants import ENCODING
```
```diff
@@ -57,7 +54,6 @@ def read_audio(name: str | Path) -> AudioT:
     from .tokenized_shell_vars import tokenized_expand
 
     def encode_output(buffer: Any) -> np.ndarray:
-        from sonusai import SonusAIError
         from .constants import BIT_DEPTH
         from .constants import ENCODING
 
```
```diff
@@ -71,14 +67,14 @@ def read_audio(name: str | Path) -> AudioT:
             return np.frombuffer(buffer, dtype=np.int32)
 
         if BIT_DEPTH == 32:
-            if ENCODING == 'floating-point':
+            if ENCODING == "floating-point":
                 return np.frombuffer(buffer, dtype=np.float32)
             return np.frombuffer(buffer, dtype=np.int32)
 
         if BIT_DEPTH == 64:
             return np.frombuffer(buffer, dtype=np.float64)
 
-        raise SonusAIError(f'Invalid BIT_DEPTH {BIT_DEPTH}')
+        raise ValueError(f"Invalid BIT_DEPTH {BIT_DEPTH}")
 
     expanded_name, _ = tokenized_expand(name)
 
```
```diff
@@ -86,36 +82,41 @@ def read_audio(name: str | Path) -> AudioT:
         # Read in and convert to desired format
         # NOTE: pysox format transformations do not handle encoding properly; need to use direct call to sox instead
         args = [
-            '-D',
-            '-G',
+            "-D",
+            "-G",
             expanded_name,
-            '-t', 'raw',
-            '-r', str(SAMPLE_RATE),
-            '-b', str(BIT_DEPTH),
-            '-c', str(CHANNEL_COUNT),
-            '-e', ENCODING,
-            '-',
-            'remix', '1',
+            "-t",
+            "raw",
+            "-r",
+            str(SAMPLE_RATE),
+            "-b",
+            str(BIT_DEPTH),
+            "-c",
+            str(CHANNEL_COUNT),
+            "-e",
+            ENCODING,
+            "-",
+            "remix",
+            "1",
         ]
         status, out, err = sox(args, None, False)
         if status != 0:
-            raise SonusAIError(f'sox stdout: {out}\nsox stderr: {err}')
+            raise RuntimeError(f"sox stdout: {out}\nsox stderr: {err}")  # noqa: TRY301
 
         return encode_output(out)
 
     except Exception as e:
         if name != expanded_name:
-            raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}):\n{e}')
+            raise OSError(f"Error reading {name} (expanded: {expanded_name}):\n{e}") from e
         else:
-            raise SonusAIError(f'Error reading {name}:\n{e}')
+            raise OSError(f"Error reading {name}:\n{e}") from e
 
 
 class Transformer(SoxTransformer):
-    """Override certain sox.Transformer methods
-    """
+    """Override certain sox.Transformer methods"""
 
     def fir(self, coefficients):
         """Use SoX's FFT convolution engine with given FIR filter coefficients.
 
         The SonusAI override allows coefficients to be either a list of numbers
         or a string containing a text file with the coefficients.
```
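The reformatted `args` list in `read_audio` maps one-to-one onto a sox command line. A sketch of the equivalent direct invocation; the input path and the values 16000/16/1/`signed-integer` stand in for sonusai's constants and are assumptions for illustration:

```python
import subprocess

cmd = [
    "sox",
    "-D",  # no automatic dither
    "-G",  # gain guard against clipping
    "input.wav",  # hypothetical input file
    "-t", "raw",  # raw samples to stdout
    "-r", "16000",
    "-b", "16",
    "-c", "1",
    "-e", "signed-integer",
    "-",
    "remix", "1",  # keep only the first channel
]
raw_bytes = subprocess.run(cmd, capture_output=True, check=True).stdout
```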
```diff
@@ -128,22 +129,20 @@ class Transformer(SoxTransformer):
         """
         from sox.core import is_number
 
-        from sonusai import SonusAIError
-
         if not isinstance(coefficients, list) and not isinstance(coefficients, str):
-            raise SonusAIError('coefficients must be a list or a str.')
+            raise TypeError("coefficients must be a list or a str.")
 
-        if isinstance(coefficients, list) and not all([is_number(c) for c in coefficients]):
-            raise SonusAIError('coefficients list must be numbers.')
+        if isinstance(coefficients, list) and not all(is_number(c) for c in coefficients):
+            raise TypeError("coefficients list must be numbers.")
 
-        effect_args = ['fir']
+        effect_args = ["fir"]
         if isinstance(coefficients, list):
-            effect_args.extend(['{:f}'.format(c) for c in coefficients])
+            effect_args.extend([f"{c:f}" for c in coefficients])
         else:
             effect_args.append(coefficients)
 
         self.effects.extend(effect_args)
-        self.effects_log.append('fir')
+        self.effects_log.append("fir")
 
         return self
 
```
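A usage sketch of the overridden `fir()`, per its docstring above (the coefficients file path is hypothetical):

```python
from sonusai.mixture.sox_audio import Transformer

tfm = Transformer()
tfm.fir([0.1, 0.2, 0.4, 0.2, 0.1])  # inline coefficients, as in stock pysox

tfm_from_file = Transformer()
tfm_from_file.fir("fir_coefficients.txt")  # SonusAI extension: path to a coefficients file
```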
```diff
@@ -181,42 +180,42 @@ class Transformer(SoxTransformer):
         from sox.core import is_number
         from sox.log import logger
 
-        from sonusai import SonusAIError
-
         if not is_number(factor) or factor <= 0:
-            raise SonusAIError('factor must be a positive number')
+            raise ValueError("factor must be a positive number")
 
         if factor < 0.5 or factor > 2:
-            logger.warning('Using an extreme time stretching factor. Quality of results will be poor')
+            logger.warning("Using an extreme time stretching factor. Quality of results will be poor")
 
-        if audio_type not in [None, 'm', 's', 'l']:
-            raise SonusAIError("audio_type must be one of None, 'm', 's', or 'l'.")
+        if audio_type not in [None, "m", "s", "l"]:
+            raise ValueError("audio_type must be one of None, 'm', 's', or 'l'.")
 
         if not isinstance(quick, bool):
-            raise SonusAIError('quick must be a boolean')
+            raise TypeError("quick must be a boolean")
 
-        effect_args = ['tempo']
+        effect_args = ["tempo"]
 
         if quick:
-            effect_args.append('-q')
+            effect_args.append("-q")
 
         if audio_type is not None:
-            effect_args.append('-{}'.format(audio_type))
+            effect_args.append(f"-{audio_type}")
 
-        effect_args.append('{:f}'.format(factor))
+        effect_args.append(f"{factor:f}")
 
         self.effects.extend(effect_args)
-        self.effects_log.append('tempo')
+        self.effects_log.append("tempo")
 
         return self
 
-    def build(self,
-              input_filepath: Optional[str | Path] = None,
-              output_filepath: Optional[str | Path] = None,
-              input_array: Optional[np.ndarray] = None,
-              sample_rate_in: Optional[float] = None,
-              extra_args: Optional[list[str]] = None,
-              return_output: bool = False) -> tuple[bool, Optional[str], Optional[str]]:
+    def build(
+        self,
+        input_filepath: str | Path | None = None,
+        output_filepath: str | Path | None = None,
+        input_array: np.ndarray | None = None,
+        sample_rate_in: float | None = None,
+        extra_args: list[str] | None = None,
+        return_output: bool = False,
+    ) -> tuple[bool, str | None, str | None]:
         """Given an input file or array, creates an output_file on disk by
         executing the current set of commands. This function returns True on
         success. If return_output is True, this function returns a triple of
```
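A usage sketch combining the overridden `tempo()` with the modernized `build()` signature (file paths are hypothetical):

```python
from sonusai.mixture.sox_audio import Transformer

tfm = Transformer()
tfm.tempo(factor=0.9, audio_type="s")  # 10% slow-down using the speech algorithm
tfm.build(input_filepath="in.wav", output_filepath="out.wav")
```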
```diff
@@ -291,18 +290,14 @@ class Transformer(SoxTransformer):
         from sox.core import sox
         from sox.log import logger
 
-        input_format, input_filepath = self._parse_inputs(
-            input_filepath, input_array, sample_rate_in
-        )
+        input_format, input_filepath = self._parse_inputs(input_filepath, input_array, sample_rate_in)
 
         if output_filepath is None:
             raise ValueError("output_filepath is not specified!")
 
         # set output parameters
         if input_filepath == output_filepath:
-            raise ValueError(
-                "input_filepath must be different from output_filepath."
-            )
+            raise ValueError("input_filepath must be different from output_filepath.")
         file_info.validate_output_file(output_filepath)
 
         args = []
```
```diff
@@ -320,26 +315,22 @@ class Transformer(SoxTransformer):
 
         status, out, err = sox(args, input_array, True)
         if status != 0:
-            raise SoxError(
-                f"Stdout: {out}\nStderr: {err}"
-            )
+            raise SoxError(f"Stdout: {out}\nStderr: {err}")
 
-        logger.info(
-            "Created %s with effects: %s",
-            output_filepath,
-            " ".join(self.effects_log)
-        )
+        logger.info("Created %s with effects: %s", output_filepath, " ".join(self.effects_log))
 
         if return_output:
             return status, out, err
 
         return True, None, None
 
-    def build_array(self,
-                    input_filepath: Optional[str | Path] = None,
-                    input_array: Optional[np.ndarray] = None,
-                    sample_rate_in: Optional[int] = None,
-                    extra_args: Optional[list[str]] = None) -> np.ndarray:
+    def build_array(
+        self,
+        input_filepath: str | Path | None = None,
+        input_array: np.ndarray | None = None,
+        sample_rate_in: int | None = None,
+        extra_args: list[str] | None = None,
+    ) -> np.ndarray:
         """Given an input file or array, returns the output as a numpy array
         by executing the current set of commands. By default, the array will
         have the same sample rate as the input file unless otherwise specified
```
```diff
@@ -405,62 +396,57 @@ class Transformer(SoxTransformer):
         from sox.log import logger
         from sox.transform import ENCODINGS_MAPPING
 
-        input_format, input_filepath = self._parse_inputs(
-            input_filepath, input_array, sample_rate_in
-        )
+        input_format, input_filepath = self._parse_inputs(input_filepath, input_array, sample_rate_in)
 
         # check if any of the below commands are part of the effects chain
-        ignored_commands = ['channels', 'convert']
+        ignored_commands = ["channels", "convert"]
         if set(ignored_commands) & set(self.effects_log):
             logger.warning(
-                "When outputting to an array, channels and convert "
-                "effects may be ignored. Use set_output_format() to "
-                "specify output formats."
+                "When outputting to an array, channels and convert "
+                + "effects may be ignored. Use set_output_format() to "
+                + "specify output formats."
             )
 
-        output_filepath = '-'
+        output_filepath = "-"
 
-        if input_format.get('file_type') is None:
+        if input_format.get("file_type") is None:
             encoding_out = np.int16
         else:
-            encoding_out = [
-                k for k, v in ENCODINGS_MAPPING.items()
-                if input_format['file_type'] == v
-            ][0]
+            encoding_out = next(k for k, v in ENCODINGS_MAPPING.items() if input_format["file_type"] == v)
 
         n_bits = np.dtype(encoding_out).itemsize * 8
 
         output_format = {
-            'file_type': 'raw',
-            'rate': sample_rate_in,
-            'bits': n_bits,
-            'channels': input_format['channels'],
-            'encoding': None,
-            'comments': None,
-            'append_comments': True,
+            "file_type": "raw",
+            "rate": sample_rate_in,
+            "bits": n_bits,
+            "channels": input_format["channels"],
+            "encoding": None,
+            "comments": None,
+            "append_comments": True,
         }
 
-        if self.output_format.get('rate') is not None:
-            output_format['rate'] = self.output_format['rate']
+        if self.output_format.get("rate") is not None:
+            output_format["rate"] = self.output_format["rate"]
 
-        if self.output_format.get('channels') is not None:
-            output_format['channels'] = self.output_format['channels']
+        if self.output_format.get("channels") is not None:
+            output_format["channels"] = self.output_format["channels"]
 
-        if self.output_format.get('bits') is not None:
-            n_bits = self.output_format['bits']
-            output_format['bits'] = n_bits
+        if self.output_format.get("bits") is not None:
+            n_bits = self.output_format["bits"]
+            output_format["bits"] = n_bits
 
         match n_bits:
             case 8:
-                encoding_out = np.int8  # type: ignore
+                encoding_out = np.int8  # type: ignore[assignment]
             case 16:
                 encoding_out = np.int16
             case 32:
-                encoding_out = np.float32  # type: ignore
+                encoding_out = np.float32  # type: ignore[assignment]
             case 64:
-                encoding_out = np.float64  # type: ignore
+                encoding_out = np.float64  # type: ignore[assignment]
             case _:
-                raise ValueError("invalid n_bits {}".format(n_bits))
+                raise ValueError(f"invalid n_bits {n_bits}")
 
         args = []
         args.extend(self.globals)
```
```diff
@@ -477,21 +463,14 @@ class Transformer(SoxTransformer):
 
         status, out, err = sox(args, input_array, False)
         if status != 0:
-            raise SoxError(
-                "Stdout: {}\nStderr: {}".format(out, err)
-            )
+            raise SoxError(f"Stdout: {out}\nStderr: {err}")
 
         out = np.frombuffer(out, dtype=encoding_out)
-        if output_format['channels'] > 1:
+        if output_format["channels"] > 1:
             out = out.reshape(
-                (
-                    output_format['channels'],
-                    int(len(out) / output_format['channels'])
-                ), order='F'
+                (output_format["channels"], int(len(out) / output_format["channels"])),
+                order="F",
             ).T
-        logger.info(
-            "Created array with effects: %s",
-            " ".join(self.effects_log)
-        )
+        logger.info("Created array with effects: %s", " ".join(self.effects_log))
 
         return out
```
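And a usage sketch of `build_array()`, which runs the effects chain on an in-memory signal and decodes sox's raw output back into numpy (the sample rate value is illustrative):

```python
import numpy as np

from sonusai.mixture.sox_audio import Transformer

x = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
tfm = Transformer()
tfm.tempo(factor=1.1, audio_type="s")
y = tfm.build_array(input_array=x, sample_rate_in=16000)
```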
sonusai/mixture/sox_augmentation.py
CHANGED
```diff
@@ -11,7 +11,6 @@ def apply_augmentation(audio: AudioT, augmentation: Augmentation, frame_length:
     :param frame_length: Pad resulting audio to be a multiple of this
     :return: Augmented audio
     """
-    from sonusai import SonusAIError
     from .augmentation import pad_audio_to_frame
     from .constants import BIT_DEPTH
     from .constants import CHANNEL_COUNT
```
```diff
@@ -45,7 +44,7 @@ def apply_augmentation(audio: AudioT, augmentation: Augmentation, frame_length:
             has_effects = True
 
         if augmentation.tempo is not None:
-            tfm.tempo(factor=float(augmentation.tempo), audio_type='s')
+            tfm.tempo(factor=float(augmentation.tempo), audio_type="s")
             has_effects = True
 
         if augmentation.eq1 is not None:
```
```diff
@@ -70,7 +69,7 @@ def apply_augmentation(audio: AudioT, augmentation: Augmentation, frame_length:
             audio_out = audio
 
     except Exception as e:
-        raise SonusAIError(f'Error applying {augmentation}: {e}')
+        raise RuntimeError(f"Error applying {augmentation}: {e}") from e
 
     # make sure length is multiple of frame_length
     return pad_audio_to_frame(audio=audio_out, frame_length=frame_length)
```
```diff
@@ -84,13 +83,13 @@ def apply_impulse_response(audio: AudioT, ir: ImpulseResponseData) -> AudioT:
     :return: Augmented audio
     """
     import math
-    from pathlib import Path
     import tempfile
+    from pathlib import Path
 
     import numpy as np
 
-    from sonusai import SonusAIError
     from sonusai.utils import linear_to_db
+
     from .constants import SAMPLE_RATE
     from .sox_audio import Transformer
 
```
```diff
@@ -111,9 +110,9 @@ def apply_impulse_response(audio: AudioT, ir: ImpulseResponseData) -> AudioT:
     audio_out = np.pad(array=audio_out, pad_width=(pad, pad))
 
     # Write coefficients to temporary file
-    temp = tempfile.NamedTemporaryFile(mode='w+t')
+    temp = tempfile.NamedTemporaryFile(mode="w+t")
     for d in ir.data:
-        temp.write(f'{d:f}\n')
+        temp.write(f"{d:f}\n")
     temp.seek(0)
 
     # Apply IR and convert back to global sample rate
```
```diff
@@ -123,7 +122,7 @@ def apply_impulse_response(audio: AudioT, ir: ImpulseResponseData) -> AudioT:
     try:
         audio_out = tfm.build_array(input_array=audio_out, sample_rate_in=ir.sample_rate)
     except Exception as e:
-        raise SonusAIError(f'Error applying IR: {e}')
+        raise RuntimeError(f"Error applying IR: {e}") from e
 
     path = Path(temp.name)
     temp.close()
```
```diff
@@ -134,4 +133,4 @@ def apply_impulse_response(audio: AudioT, ir: ImpulseResponseData) -> AudioT:
     tfm.norm(db_level=max_db)
     audio_out = tfm.build_array(input_array=audio_out, sample_rate_in=SAMPLE_RATE)
 
-    return audio_out[:len(audio)]
+    return audio_out[: len(audio)]
```
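The IR recipe above (measure the input's level, convolve with the impulse response, renormalize, trim) reduces to the following standalone numpy sketch; `np.convolve` stands in for sox's `fir` effect and the resampling steps are omitted:

```python
import numpy as np


def apply_ir_sketch(audio: np.ndarray, ir: np.ndarray) -> np.ndarray:
    peak = max(np.abs(audio).max(), 1e-9)  # remember the input level
    out = np.convolve(audio, ir)  # stand-in for the sox fir effect
    out *= peak / max(np.abs(out).max(), 1e-9)  # renormalize to that level
    return out[: len(audio)]  # trim back to the original length
```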
sonusai/mixture/spectral_mask.py
CHANGED
```diff
@@ -2,7 +2,7 @@ from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import SpectralMask
 
 
-def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int = None) -> AudioF:
+def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int | None = None) -> AudioF:
     """Apply frequency and time masking
 
     Implementation of SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
```
```diff
@@ -24,10 +24,8 @@ def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int
     """
     import numpy as np
 
-    from sonusai import SonusAIError
-
     if audio_f.ndim != 2:
-        raise SonusAIError('feature input must have three dimensions [frames, bins]')
+        raise ValueError("feature input must have three dimensions [frames, bins]")
 
     frames, bins = audio_f.shape
 
```
```diff
@@ -41,13 +39,13 @@ def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int
     for _ in range(spectral_mask.f_num):
         f_width = int(rng.uniform(0, f_max_width))
         f_start = rng.integers(0, bins - f_width, endpoint=True)
-        audio_f[:, f_start:f_start + f_width] = 0
+        audio_f[:, f_start : f_start + f_width] = 0
 
     # apply t_num time masks to the feature
     t_upper_bound = int(spectral_mask.t_max_percent / 100 * frames)
     for _ in range(spectral_mask.t_num):
         t_width = min(int(rng.uniform(0, spectral_mask.t_max_width)), t_upper_bound)
         t_start = rng.integers(0, frames - t_width, endpoint=True)
-        audio_f[t_start:t_start + t_width, :] = 0
+        audio_f[t_start : t_start + t_width, :] = 0
 
     return audio_f
```
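For reference, the masking loops implement standard SpecAugment; a standalone sketch on a random [frames, bins] feature (mask widths and counts here are illustrative):

```python
import numpy as np

rng = np.random.default_rng(1234)
audio_f = rng.normal(size=(100, 64)).astype(np.float32)
frames, bins = audio_f.shape

# one frequency mask: zero a random band of bins across all frames
f_width = int(rng.uniform(0, 8))
f_start = rng.integers(0, bins - f_width, endpoint=True)
audio_f[:, f_start : f_start + f_width] = 0

# one time mask: zero a random span of frames across all bins
t_width = min(int(rng.uniform(0, 10)), frames // 2)
t_start = rng.integers(0, frames - t_width, endpoint=True)
audio_f[t_start : t_start + t_width, :] = 0
```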