sonusai 0.19.10__py3-none-any.whl → 0.20.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,8 @@
1
+ from pathlib import Path
2
+
3
+ import numpy as np
4
+ from sox import Transformer as SoxTransformer
5
+
1
6
  from sonusai.mixture.datatypes import AudioStatsMetrics
2
7
  from sonusai.mixture.datatypes import AudioT
3
8
 
@@ -12,7 +17,6 @@ def _convert_str_with_factors_to_int(x: str) -> int:
12
17
 
13
18
  def calc_audio_stats(audio: AudioT, win_len: float | None = None) -> AudioStatsMetrics:
14
19
  from sonusai.mixture import SAMPLE_RATE
15
- from sonusai.mixture import Transformer
16
20
 
17
21
  args = ["stats"]
18
22
  if win_len is not None:
@@ -53,3 +57,275 @@ def calc_audio_stats(audio: AudioT, win_len: float | None = None) -> AudioStatsM
53
57
  fl=float(stats["Flat factor"]),
54
58
  pkc=_convert_str_with_factors_to_int(stats["Pk count"]),
55
59
  )
60
+
61
+
62
class Transformer(SoxTransformer):
    """sox.Transformer subclass that overrides ``build`` and ``build_array``."""

    def build(  # pyright: ignore [reportIncompatibleMethodOverride]
        self,
        input_filepath: str | Path | None = None,
        output_filepath: str | Path | None = None,
        input_array: np.ndarray | None = None,
        sample_rate_in: float | None = None,
        extra_args: list[str] | None = None,
        return_output: bool = False,
    ) -> tuple[bool, str | None, str | None]:
        """Run the current effects chain through sox, writing to output_filepath.

        Unlike a plain "True on success" result, this override always
        returns a 3-tuple so callers can unpack it unconditionally.

        Parameters
        ----------
        input_filepath : str, Path or None
            Either path to input audio file or None for array input.
        output_filepath : str or Path
            Path to desired output file. If a file already exists at
            the given path, the file will be overwritten.
            If '-n', no file is created.
        input_array : np.ndarray or None
            An np.ndarray of a waveform with shape (n_samples, n_channels).
            sample_rate_in must also be provided.
            If None, input_filepath must be specified.
        sample_rate_in : float
            Sample rate of input_array.
            This argument is ignored if input_array is None.
        extra_args : list or None, default=None
            If a list is given, these additional arguments are passed to SoX
            at the end of the list of effects.
            Don't use this argument unless you know exactly what you're doing!
        return_output : bool, default=False
            If True, the stdout and stderr captured from sox are returned
            along with the status.

        Returns
        -------
        status : bool
            ``True`` when return_output is False. When return_output is True
            this is the status value reported by sox, which is ``0`` at this
            point because any non-zero status raises.
        out : str or None
            stdout produced by sox when return_output is True, else None.
        err : str or None
            stderr produced by sox when return_output is True, else None.

        Raises
        ------
        ValueError
            If output_filepath is None, if it names the same path as the
            input, or if extra_args is not a list.
        SoxError
            If sox reports a non-zero status.

        Examples
        --------
        > import numpy as np
        > import sox
        > tfm = sox.Transformer()
        > sample_rate = 44100
        > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)

        file in, file out - basic usage

        > status = tfm.build('path/to/input.wav', 'path/to/output.mp3')

        file in, file out - equivalent usage

        > status = tfm.build(
              input_filepath='path/to/input.wav',
              output_filepath='path/to/output.mp3'
          )

        array in, file out

        > status = tfm.build(
              input_array=y, sample_rate_in=sample_rate,
              output_filepath='path/to/output.mp3'
          )

        """
        from sox import file_info
        from sox.core import SoxError
        from sox.core import sox
        from sox.log import logger

        input_format, input_filepath = self._parse_inputs(input_filepath, input_array, sample_rate_in)

        if output_filepath is None:
            raise ValueError("output_filepath is not specified!")

        # set output parameters
        # Compare the string forms: a Path and a str naming the same file are
        # never equal under ==, which would let sox overwrite its own input.
        if input_filepath is not None and str(input_filepath) == str(output_filepath):
            raise ValueError("input_filepath must be different from output_filepath.")
        file_info.validate_output_file(output_filepath)

        args = []
        args.extend(self.globals)
        args.extend(self._input_format_args(input_format))
        args.append(input_filepath)
        args.extend(self._output_format_args(self.output_format))
        args.append(output_filepath)
        args.extend(self.effects)

        if extra_args is not None:
            if not isinstance(extra_args, list):
                raise ValueError("extra_args must be a list.")
            args.extend(extra_args)

        # NOTE(review): the third positional argument presumably asks sox.core.sox
        # to decode its output as text - confirm against the installed sox package.
        status, out, err = sox(args, input_array, True)
        if status != 0:
            raise SoxError(f"Stdout: {out}\nStderr: {err}")

        logger.info("Created %s with effects: %s", output_filepath, " ".join(self.effects_log))

        if return_output:
            return status, out, err  # pyright: ignore [reportReturnType]

        return True, None, None

    def build_array(  # pyright: ignore [reportIncompatibleMethodOverride]
        self,
        input_filepath: str | Path | None = None,
        input_array: np.ndarray | None = None,
        sample_rate_in: int | None = None,
        extra_args: list[str] | None = None,
    ) -> np.ndarray:
        """Run the current effects chain through sox and return the audio as an array.

        By default, the array will have the same sample rate as the input
        unless otherwise specified using set_output_format. Functions such as
        channels and convert will be ignored!

        The SonusAI override does not generate a warning for rate transforms.

        Parameters
        ----------
        input_filepath : str, Path or None
            Either path to input audio file or None.
        input_array : np.ndarray or None
            A np.ndarray of a waveform with shape (n_samples, n_channels).
            If this argument is passed, sample_rate_in must also be provided.
            If None, input_filepath must be specified.
        sample_rate_in : int
            Sample rate of input_array.
            This argument is ignored if input_array is None.
        extra_args : list or None, default=None
            If a list is given, these additional arguments are passed to SoX
            at the end of the list of effects.
            Don't use this argument unless you know exactly what you're doing!

        Returns
        -------
        output_array : np.ndarray
            Output audio as a numpy array. With more than one output channel
            the array is reshaped to (n_samples, n_channels).

        Raises
        ------
        ValueError
            If extra_args is not a list, or the resolved bit depth is not one
            of 8, 16, 32 or 64.
        SoxError
            If sox reports a non-zero status.

        Examples
        --------

        > import numpy as np
        > import sox
        > tfm = sox.Transformer()
        > sample_rate = 44100
        > y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)

        file in, array out

        > output_array = tfm.build_array(input_filepath='path/to/input.wav')

        array in, array out

        > output_array = tfm.build_array(input_array=y, sample_rate_in=sample_rate)

        specifying the output sample rate

        > tfm.set_output_format(rate=8000)
        > output_array = tfm.build_array(input_array=y, sample_rate_in=sample_rate)

        if an effect changes the number of channels, you must explicitly
        specify the number of output channels

        > tfm.remix(remix_dictionary={1: [1], 2: [1], 3: [1]})
        > tfm.set_output_format(channels=3)
        > output_array = tfm.build_array(input_array=y, sample_rate_in=sample_rate)

        """
        from sox.core import SoxError
        from sox.core import sox
        from sox.log import logger
        from sox.transform import ENCODINGS_MAPPING

        input_format, input_filepath = self._parse_inputs(input_filepath, input_array, sample_rate_in)

        # check if any of the below commands are part of the effects chain
        ignored_commands = ["channels", "convert"]
        if set(ignored_commands) & set(self.effects_log):
            logger.warning(
                "When outputting to an array, channels and convert "
                "effects may be ignored. Use set_output_format() to "
                "specify output formats."
            )

        # "-" is passed to sox as the output target; the audio is then taken
        # from the captured output below instead of from a file.
        output_filepath = "-"

        # Default to 16-bit samples unless the input format names a file type
        # that maps back to a numpy dtype.
        if input_format.get("file_type") is None:
            encoding_out = np.int16
        else:
            encoding_out = next(k for k, v in ENCODINGS_MAPPING.items() if input_format["file_type"] == v)

        n_bits = np.dtype(encoding_out).itemsize * 8

        output_format = {
            "file_type": "raw",
            "rate": sample_rate_in,
            "bits": n_bits,
            "channels": input_format["channels"],
            "encoding": None,
            "comments": None,
            "append_comments": True,
        }

        # Explicit settings from set_output_format() take precedence.
        if self.output_format.get("rate") is not None:
            output_format["rate"] = self.output_format["rate"]

        if self.output_format.get("channels") is not None:
            output_format["channels"] = self.output_format["channels"]

        if self.output_format.get("bits") is not None:
            n_bits = self.output_format["bits"]
            output_format["bits"] = n_bits

        # The final bit depth alone decides the dtype used to decode the raw bytes.
        dtype_by_bits = {8: np.int8, 16: np.int16, 32: np.float32, 64: np.float64}
        encoding_out = dtype_by_bits.get(n_bits)
        if encoding_out is None:
            raise ValueError(f"invalid n_bits {n_bits}")

        args = []
        args.extend(self.globals)
        args.extend(self._input_format_args(input_format))
        args.append(input_filepath)
        args.extend(self._output_format_args(output_format))
        args.append(output_filepath)
        args.extend(self.effects)

        if extra_args is not None:
            if not isinstance(extra_args, list):
                raise ValueError("extra_args must be a list.")
            args.extend(extra_args)

        # NOTE(review): the third positional argument is False here (True in build),
        # presumably so the captured output stays as raw bytes - confirm against sox.core.sox.
        status, out, err = sox(args, input_array, False)
        if status != 0:
            raise SoxError(f"Stdout: {out}\nStderr: {err}")

        out = np.frombuffer(out, dtype=encoding_out)  # pyright: ignore [reportArgumentType, reportCallIssue]
        n_channels = output_format["channels"]
        if n_channels > 1:
            # Fortran-order reshape to (channels, samples), then transpose to (samples, channels).
            out = out.reshape((n_channels, len(out) // n_channels), order="F").T
        logger.info("Created array with effects: %s", " ".join(self.effects_log))

        return out
@@ -137,7 +137,9 @@ def main() -> None:
137
137
  print(f"Could not open SonusAI mixture database in {location}, exiting ...")
138
138
  return
139
139
 
140
- metrics_present = mixdb.cached_metrics()
140
+ # Only check first and last mixture in order to save time
141
+ metrics_present = mixdb.cached_metrics([0, mixdb.num_mixtures - 1])
142
+
141
143
  num_metrics_present = len(metrics_present)
142
144
  if num_metrics_present < 1:
143
145
  print(f"mixdb reports no pre-generated metrics are present. Nothing to summarize in {location}, exiting ...")
@@ -150,7 +152,7 @@ def main() -> None:
150
152
  create_file_handler(join(location, "metrics_summary.log"))
151
153
  update_console_handler(verbose)
152
154
  initial_log_messages("metrics_summary")
153
- logger.info(f"Logging summary of SonusAI mixture db at {location}")
155
+ logger.info(f"Logging summary of SonusAI mixture database at {location}")
154
156
  else:
155
157
  update_console_handler(verbose)
156
158
 
@@ -164,7 +166,7 @@ def main() -> None:
164
166
  fsuffix = f"_s{len(mixids)}t{mixdb.num_mixtures}"
165
167
  else:
166
168
  logger.info(
167
- f"Summarizing SonusAI mixture db with {mixdb.num_mixtures} mixtures "
169
+ f"Summarizing SonusAI mixture database with {mixdb.num_mixtures} mixtures "
168
170
  f"and {num_metrics_present} pre-generated metrics ..."
169
171
  )
170
172
  fsuffix = ""
@@ -5,6 +5,7 @@ from .audio import get_duration
5
5
  from .audio import get_next_noise
6
6
  from .audio import get_num_samples
7
7
  from .audio import get_sample_rate
8
+ from .audio import raw_read_audio
8
9
  from .audio import read_audio
9
10
  from .audio import read_ir
10
11
  from .audio import validate_input_file
@@ -53,7 +54,9 @@ from .datatypes import AudioF
53
54
  from .datatypes import AudioStatsMetrics
54
55
  from .datatypes import AudioT
55
56
  from .datatypes import Augmentation
57
+ from .datatypes import AugmentationEffects
56
58
  from .datatypes import AugmentationRule
59
+ from .datatypes import AugmentationRuleEffects
57
60
  from .datatypes import AugmentedTarget
58
61
  from .datatypes import ClassCount
59
62
  from .datatypes import EnergyF
@@ -111,10 +114,10 @@ from .helpers import get_transform_from_audio
111
114
  from .helpers import inverse_transform
112
115
  from .helpers import mixture_metadata
113
116
  from .helpers import write_mixture_metadata
117
+ from .ir_delay import get_impulse_response_delay
114
118
  from .log_duration_and_sizes import log_duration_and_sizes
115
119
  from .mixdb import MixtureDatabase
116
120
  from .mixdb import db_file
117
- from .sox_audio import Transformer
118
121
  from .spectral_mask import apply_spectral_mask
119
122
  from .target_class_balancing import balance_targets
120
123
  from .targets import get_augmented_target_ids_by_class
sonusai/mixture/audio.py CHANGED
@@ -58,9 +58,62 @@ def get_sample_rate(name: str | Path, use_cache: bool = True) -> int:
58
58
 
59
59
@lru_cache
def _get_sample_rate(name: str | Path) -> int:
    """Get the sample rate of an audio file.

    Files ending in ".mp3" or ".m4a" are opened with pydub; everything else is
    inspected with soundfile.

    :param name: File name (expanded with tokenized_expand before use)
    :return: Sample rate
    :raises OSError: If the file cannot be read; the original error is chained
    """
    import soundfile
    from pydub import AudioSegment

    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        if expanded_name.endswith(".mp3"):
            return AudioSegment.from_mp3(expanded_name).frame_rate

        if expanded_name.endswith(".m4a"):
            return AudioSegment.from_file(expanded_name).frame_rate

        return soundfile.info(expanded_name).samplerate
    except Exception as e:
        # Compare as strings: a Path never equals a str, which would make the
        # "(expanded: ...)" form appear even when nothing was expanded.
        if str(name) != expanded_name:
            raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
        raise OSError(f"Error reading {name}: {e}") from e
86
+
87
+
88
def raw_read_audio(name: str | Path) -> tuple[AudioT, int]:
    """Read the first channel of an audio file without resampling.

    Files ending in ".mp3" or ".m4a" are decoded with pydub and scaled by the
    full-scale value for their sample width; everything else is read with
    soundfile as float32.

    :param name: File name (expanded with tokenized_expand before use)
    :return: Tuple of (1-D float32 array holding channel 0, sample rate of the file)
    :raises OSError: If the file cannot be read; the original error is chained
    """
    import numpy as np
    import soundfile
    from pydub import AudioSegment

    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        if expanded_name.endswith((".mp3", ".m4a")):
            if expanded_name.endswith(".mp3"):
                sound = AudioSegment.from_mp3(expanded_name)
            else:
                sound = AudioSegment.from_file(expanded_name)
            # Samples are reshaped to one row per frame and one column per channel.
            raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
            # Scale integer samples by 2**(bits - 1) for the sample width.
            raw = raw / 2 ** (sound.sample_width * 8 - 1)
            sample_rate = sound.frame_rate
        else:
            raw, sample_rate = soundfile.read(expanded_name, always_2d=True, dtype="float32")
    except Exception as e:
        # Compare as strings: a Path never equals a str, which would make the
        # "(expanded: ...)" form appear even when nothing was expanded.
        if str(name) != expanded_name:
            raise OSError(f"Error reading {name} (expanded: {expanded_name}): {e}") from e
        raise OSError(f"Error reading {name}: {e}") from e

    # The column slice is already 1-D; squeezing it would collapse a
    # single-sample file into a 0-d array.
    return raw[:, 0].astype(np.float32), sample_rate
64
117
 
65
118
 
66
119
  def read_audio(name: str | Path, use_cache: bool = True) -> AudioT:
@@ -77,28 +130,45 @@ def read_audio(name: str | Path, use_cache: bool = True) -> AudioT:
77
130
 
78
131
@lru_cache
def _read_audio(name: str | Path) -> AudioT:
    """Read an audio file and resample it to SAMPLE_RATE.

    :param name: File name
    :return: Array of time domain audio data
    """
    import librosa

    from .constants import SAMPLE_RATE

    # raw_read_audio returns the audio together with the file's own sample rate.
    audio, native_rate = raw_read_audio(name)

    return librosa.resample(
        audio,
        orig_sr=native_rate,
        target_sr=SAMPLE_RATE,
        res_type="soxr_hq",
    )
83
146
 
84
147
 
85
def read_ir(name: str | Path, delay: int, use_cache: bool = True) -> ImpulseResponseData:
    """Read impulse response data, optionally bypassing the LRU cache

    :param name: File name
    :param delay: Delay in samples
    :param use_cache: If true, use LRU caching
    :return: ImpulseResponseData object
    """
    # __wrapped__ is the undecorated function behind the lru_cache wrapper.
    reader = _read_ir if use_cache else _read_ir.__wrapped__
    return reader(name, delay)
95
159
 
96
160
 
97
161
@lru_cache
def _read_ir(name: str | Path, delay: int) -> ImpulseResponseData:
    """Read impulse response data from a file at its own sample rate

    :param name: File name
    :param delay: Delay in samples
    :return: ImpulseResponseData object
    """
    # The data is stored as read, with the file's sample rate kept alongside it.
    ir_data, ir_sample_rate = raw_read_audio(name)
    return ImpulseResponseData(
        data=ir_data,
        sample_rate=ir_sample_rate,
        delay=delay,
    )
102
172
 
103
173
 
104
174
  def get_num_samples(name: str | Path, use_cache: bool = True) -> int:
@@ -120,6 +190,27 @@ def _get_num_samples(name: str | Path) -> int:
120
190
  :param name: File name
121
191
  :return: number of samples in resampled audio
122
192
  """
123
- from .soundfile_audio import get_num_samples
193
+ import math
194
+
195
+ import soundfile
196
+ from pydub import AudioSegment
124
197
 
125
- return get_num_samples(name)
198
+ from .constants import SAMPLE_RATE
199
+ from .tokenized_shell_vars import tokenized_expand
200
+
201
+ expanded_name, _ = tokenized_expand(name)
202
+
203
+ if expanded_name.endswith(".mp3"):
204
+ sound = AudioSegment.from_mp3(expanded_name)
205
+ samples = sound.frame_count()
206
+ sample_rate = sound.frame_rate
207
+ elif expanded_name.endswith(".m4a"):
208
+ sound = AudioSegment.from_file(expanded_name)
209
+ samples = sound.frame_count()
210
+ sample_rate = sound.frame_rate
211
+ else:
212
+ info = soundfile.info(name)
213
+ samples = info.frames
214
+ sample_rate = info.samplerate
215
+
216
+ return math.ceil(SAMPLE_RATE * samples / sample_rate)