sonusai 0.15.6__py3-none-any.whl → 0.15.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
  from typing import Any
 
+ from pyaaware import ForwardTransform
+ from pyaaware import InverseTransform
+
+ from sonusai.mixture import EnergyT
  from sonusai.mixture.datatypes import AudioF
  from sonusai.mixture.datatypes import AudioT
  from sonusai.mixture.datatypes import AudiosT
@@ -78,7 +82,7 @@ def get_feature_generator_info(fg_config: FeatureGeneratorConfig) -> FeatureGene
  decimation=fg.decimation,
  stride=fg.stride,
  step=fg.step,
- num_bands=fg.num_bands,
+ feature_parameters=fg.feature_parameters,
  ft_config=TransformConfig(N=fg.ftransform_N,
  R=fg.ftransform_R,
  bin_start=fg.bin_start,
@@ -327,15 +331,14 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
  import numpy as np
  from pyaaware import FeatureGenerator
 
- from .spectral_mask import apply_spectral_mask
  from .truth import truth_reduction
 
- mixture_f = get_mixture_f(mixdb=mixdb, mixture_audio=mixture_audio)
+ mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)
 
  transform_frames = mixdb.mixture_transform_frames(mixture.samples)
  feature_frames = mixdb.mixture_feature_frames(mixture.samples)
 
- feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.fg_num_bands), dtype=np.float32)
+ feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.feature_parameters), dtype=np.float32)
  truth_f = np.empty((feature_frames, mixdb.num_classes), dtype=np.complex64)
 
  fg = FeatureGenerator(**asdict(mixdb.fg_config))
@@ -350,11 +353,6 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
  truth_f[feature_frame] = fg.truth()
  feature_frame += 1
 
- if mixture.spectral_mask_id is not None:
- feature = apply_spectral_mask(feature=feature,
- spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
- seed=mixture.spectral_mask_seed)
-
  if np.isreal(truth_f).all():
  return feature, truth_f.real
 
@@ -444,14 +442,35 @@ def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT)
  return np.sum(targets_ir, axis=0)
 
 
- def get_mixture_f(mixdb: MixtureDatabase, mixture_audio: AudioT) -> AudioF:
+ def get_mixture_f(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT) -> AudioF:
  """Get the mixture transform for the given mixture
 
  :param mixdb: Mixture database
+ :param mixture: Mixture record
  :param mixture_audio: Mixture audio data for the given mixid
  :return: Mixture transform data
  """
- return forward_transform(mixture_audio, mixdb.ft_config)
+ from .spectral_mask import apply_spectral_mask
+
+ mixture_f = forward_transform(mixture_audio, mixdb.ft_config)
+
+ if mixture.spectral_mask_id is not None:
+ mixture_f = apply_spectral_mask(audio_f=mixture_f,
+ spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
+ seed=mixture.spectral_mask_seed)
+
+ return mixture_f
+
+
+ def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tuple[AudioF, EnergyT]:
+ """Apply forward transform to input audio data to generate transform data
+
+ :param audio: Time domain data [samples]
+ :param transform: ForwardTransform object
+ :return: Frequency domain data [frames, bins], Energy [frames]
+ """
+ f, e = transform.execute_all(audio)
+ return f.transpose(), e
 
 
  def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
@@ -465,17 +484,30 @@ def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
  """
  from pyaaware import AawareForwardTransform
 
- from .audio import calculate_transform_from_audio
-
- audio_f, _ = calculate_transform_from_audio(audio=audio,
- transform=AawareForwardTransform(N=config.N,
- R=config.R,
- bin_start=config.bin_start,
- bin_end=config.bin_end,
- ttype=config.ttype))
+ audio_f, _ = get_transform_from_audio(audio=audio,
+ transform=AawareForwardTransform(N=config.N,
+ R=config.R,
+ bin_start=config.bin_start,
+ bin_end=config.bin_end,
+ ttype=config.ttype))
  return audio_f
 
 
+ def get_audio_from_transform(data: AudioF, transform: InverseTransform, trim: bool = True) -> tuple[AudioT, EnergyT]:
+ """Apply inverse transform to input transform data to generate audio data
+
+ :param data: Frequency domain data [frames, bins]
+ :param transform: InverseTransform object
+ :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the transform
+ :return: Time domain data [samples], Energy [frames]
+ """
+ t, e = transform.execute_all(data.transpose())
+ if trim:
+ t = t[transform.N - transform.R:]
+
+ return t, e
+
+
  def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = True) -> AudioT:
  """Transform frequency domain data into time domain using the inverse transform config from the feature
 
@@ -490,16 +522,14 @@ def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = T
  import numpy as np
  from pyaaware import AawareInverseTransform
 
- from .audio import calculate_audio_from_transform
-
- audio, _ = calculate_audio_from_transform(data=transform,
- transform=AawareInverseTransform(N=config.N,
- R=config.R,
- bin_start=config.bin_start,
- bin_end=config.bin_end,
- ttype=config.ttype,
- gain=np.float32(1)),
- trim=trim)
+ audio, _ = get_audio_from_transform(data=transform,
+ transform=AawareInverseTransform(N=config.N,
+ R=config.R,
+ bin_start=config.bin_start,
+ bin_end=config.bin_end,
+ ttype=config.ttype,
+ gain=np.float32(1)),
+ trim=trim)
  return audio
 
 
@@ -534,7 +564,7 @@ def augmented_target_samples(target_files: TargetFiles,
  it = list(product(*[target_ids, target_augmentation_ids]))
  return sum([estimate_augmented_length_from_length(
  length=target_files[fi].samples,
- tempo=target_augmentations[ai].tempo,
+ tempo=float(target_augmentations[ai].tempo),
  frame_length=feature_step_samples) for fi, ai, in it])
 
 
@@ -1,7 +1,7 @@
  def log_duration_and_sizes(total_duration: float,
  num_classes: int,
  feature_step_samples: int,
- num_bands: int,
+ feature_parameters: int,
  stride: int,
  desc: str) -> None:
  from sonusai import logger
@@ -14,7 +14,7 @@ def log_duration_and_sizes(total_duration: float,
  total_samples = int(total_duration * SAMPLE_RATE)
  mixture_bytes = total_samples * SAMPLE_BYTES
  truth_t_bytes = total_samples * num_classes * FLOAT_BYTES
- feature_bytes = total_samples / feature_step_samples * stride * num_bands * FLOAT_BYTES
+ feature_bytes = total_samples / feature_step_samples * stride * feature_parameters * FLOAT_BYTES
  truth_f_bytes = total_samples / feature_step_samples * num_classes * FLOAT_BYTES
 
  logger.info('')
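The estimate above only swaps feature_parameters in for the old num_bands term. A back-of-the-envelope sketch with made-up values (SAMPLE_RATE is assumed to be sonusai's 16 kHz rate):

    # Illustrative size estimate; every value here is invented for the example.
    SAMPLE_RATE = 16000          # assumed sonusai sample rate
    FLOAT_BYTES = 4

    total_duration = 3600.0      # one hour of mixture audio
    feature_step_samples = 1024
    stride = 6
    feature_parameters = 70

    total_samples = int(total_duration * SAMPLE_RATE)
    feature_bytes = total_samples / feature_step_samples * stride * feature_parameters * FLOAT_BYTES
    print(f'feature storage estimate: {feature_bytes / 2**20:.1f} MiB')   # roughly 90 MiB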
sonusai/mixture/mixdb.py CHANGED
@@ -248,8 +248,8 @@ class MixtureDatabase:
  return self.fg_info.step
 
  @cached_property
- def fg_num_bands(self) -> int:
- return self.fg_info.num_bands
+ def feature_parameters(self) -> int:
+ return self.fg_info.feature_parameters
 
  @cached_property
  def ft_config(self) -> TransformConfig:
@@ -809,11 +809,20 @@
  :return: Mixture transform data
  """
  from .helpers import forward_transform
+ from .spectral_mask import apply_spectral_mask
 
  if force or mixture is None:
  mixture = self.mixture_mixture(m_id, targets, target, noise, force)
 
- return forward_transform(mixture, self.ft_config)
+ mixture_f = forward_transform(mixture, self.ft_config)
+
+ m = self.mixture(m_id)
+ if m.spectral_mask_id is not None:
+ mixture_f = apply_spectral_mask(audio_f=mixture_f,
+ spectral_mask=self.spectral_mask(int(m.spectral_mask_id)),
+ seed=m.spectral_mask_seed)
+
+ return mixture_f
 
  def mixture_truth_t(self,
  m_id: int,
@@ -938,7 +947,6 @@
  import numpy as np
  from pyaaware import FeatureGenerator
 
- from .spectral_mask import apply_spectral_mask
  from .truth import truth_reduction
 
  if not force:
@@ -964,7 +972,7 @@
  if truth_t is None:
  truth_t = np.zeros((m.samples, self.num_classes), dtype=np.float32)
 
- feature = np.empty((feature_frames, self.fg_stride, self.fg_num_bands), dtype=np.float32)
+ feature = np.empty((feature_frames, self.fg_stride, self.feature_parameters), dtype=np.float32)
  truth_f = np.empty((feature_frames, self.num_classes), dtype=np.complex64)
 
  fg = FeatureGenerator(**asdict(self.fg_config))
@@ -979,11 +987,6 @@
  truth_f[feature_frame] = fg.truth()
  feature_frame += 1
 
- if m.spectral_mask_id is not None:
- feature = apply_spectral_mask(feature=feature,
- spectral_mask=self.spectral_mask(int(m.spectral_mask_id)),
- seed=m.spectral_mask_seed)
-
  if np.isreal(truth_f).all():
  return feature, truth_f.real
 
@@ -1,23 +1,23 @@
- from sonusai.mixture.datatypes import Feature
+ from sonusai.mixture.datatypes import AudioF
  from sonusai.mixture.datatypes import SpectralMask
 
 
- def apply_spectral_mask(feature: Feature, spectral_mask: SpectralMask, seed: int = None) -> Feature:
+ def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int = None) -> AudioF:
  """Apply frequency and time masking
 
  Implementation of SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
 
  Ref: https://arxiv.org/pdf/1904.08779.pdf
 
- f_width consecutive bands [f_start, f_start + f_width) are masked, where f_width is chosen from a uniform
- distribution from 0 to the f_max_width, and f_start is chosen from [0, bands - f_width).
+ f_width consecutive bins [f_start, f_start + f_width) are masked, where f_width is chosen from a uniform
+ distribution from 0 to the f_max_width, and f_start is chosen from [0, bins - f_width).
 
  t_width consecutive frames [t_start, t_start + t_width) are masked, where t_width is chosen from a uniform
  distribution from 0 to the t_max_width, and t_start is chosen from [0, frames - t_width).
 
  A time mask cannot be wider than t_max_percent times the number of frames.
 
- :param feature: Numpy array of feature data [frames, strides, bands]
+ :param audio_f: Numpy array of transform audio data [frames, bins]
  :param spectral_mask: Spectral mask parameters
  :param seed: Random number seed
  :return: Augmented feature
@@ -26,28 +26,28 @@ def apply_spectral_mask(feature: Feature, spectral_mask: SpectralMask, seed: int
 
  from sonusai import SonusAIError
 
- if feature.ndim != 3:
- raise SonusAIError('feature input must have three dimensions [frames, strides, bands]')
+ if audio_f.ndim != 2:
+ raise SonusAIError('audio_f input must have two dimensions [frames, bins]')
 
- frames, strides, bands = feature.shape
+ frames, bins = audio_f.shape
 
  f_max_width = spectral_mask.f_max_width
- if f_max_width not in range(0, bands + 1):
- f_max_width = bands
+ if f_max_width not in range(0, bins + 1):
+ f_max_width = bins
 
  rng = np.random.default_rng(seed)
 
  # apply f_num frequency masks to the feature
  for _ in range(spectral_mask.f_num):
  f_width = int(rng.uniform(0, f_max_width))
- f_start = rng.integers(0, bands - f_width, endpoint=True)
- feature[:, :, f_start:f_start + f_width] = 0
+ f_start = rng.integers(0, bins - f_width, endpoint=True)
+ audio_f[:, f_start:f_start + f_width] = 0
 
  # apply t_num time masks to the feature
  t_upper_bound = int(spectral_mask.t_max_percent / 100 * frames)
  for _ in range(spectral_mask.t_num):
  t_width = min(int(rng.uniform(0, spectral_mask.t_max_width)), t_upper_bound)
  t_start = rng.integers(0, frames - t_width, endpoint=True)
- feature[t_start:t_start + t_width, :, :] = 0
+ audio_f[t_start:t_start + t_width, :] = 0
 
- return feature
+ return audio_f
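With this change the SpecAugment-style masking operates on two-dimensional transform data instead of the three-dimensional feature, which is why the earlier hunks move the call out of feature generation and into the mixture transform path. A minimal usage sketch, assuming sonusai 0.15.9; the SpectralMask values are illustrative and its field names are taken from the function above:

    # Usage sketch for the relocated masking; mask parameters are illustrative.
    import numpy as np

    from sonusai.mixture.datatypes import SpectralMask
    from sonusai.mixture.spectral_mask import apply_spectral_mask

    rng = np.random.default_rng(0)
    audio_f = (rng.normal(size=(100, 128)) + 1j * rng.normal(size=(100, 128))).astype(np.complex64)

    mask = SpectralMask(f_max_width=27, f_num=2, t_max_width=100, t_num=2, t_max_percent=100)
    masked = apply_spectral_mask(audio_f=audio_f, spectral_mask=mask, seed=42)   # still [frames, bins]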
@@ -23,6 +23,8 @@ class Data:
  num_classes=config.num_classes,
  truth_mutex=config.mutex)
 
+ self.feature_parameters = fg.feature_parameters
+ self.ttype = fg.ftransform_ttype
  self.frame_size = fg.ftransform_R
 
  if len(target_audio) % self.frame_size != 0:
@@ -13,12 +13,13 @@ Calculates the true transform of the target using the STFT
  configuration defined by the feature. This will include a
  forward transform window if defined by the feature.
 
- Output shape: [:, 2 * bins] (stacked real, imag)
+ Output shape: [:, num_classes]
+ (target stacked real, imag; or real only for tdac-co)
  """
 
  from sonusai import SonusAIError
 
- if data.config.num_classes != 2 * data.target_fft.bins:
+ if data.config.num_classes != data.feature_parameters:
  raise SonusAIError(f'Invalid num_classes for target_f truth: {data.config.num_classes}')
 
  target_freq = _execute_fft(data.target_audio, data.target_fft, len(data.offsets))
@@ -28,6 +29,7 @@ Output shape: [:, 2 * bins] (stacked real, imag)
  frame_size=data.frame_size,
  zero_based_indices=data.zero_based_indices,
  bins=data.target_fft.bins,
+ ttype=data.ttype,
  start=0,
  truth=data.truth)
 
@@ -43,11 +45,13 @@ using the STFT configuration defined by the feature. This
  will include a forward transform window if defined by the
  feature.
 
- Output shape: [:, 4 * bins] (target stacked real, imag; mixture stacked real, imag)
+ Output shape: [:, 2 * num_classes]
+ (target stacked real, imag; or real only for tdac-co)
+ (mixture stacked real, imag; or real only for tdac-co)
  """
  from sonusai import SonusAIError
 
- if data.config.num_classes != 2 * data.target_fft.bins + 2 * data.mixture_fft.bins:
+ if data.config.num_classes != 2 * data.feature_parameters:
  raise SonusAIError(f'Invalid num_classes for target_mixture_f truth: {data.config.num_classes}')
 
  target_freq = _execute_fft(data.target_audio, data.target_fft, len(data.offsets))
@@ -59,6 +63,7 @@ Output shape: [:, 4 * bins] (target stacked real, imag; mixture stacked real, im
  frame_size=data.frame_size,
  zero_based_indices=data.zero_based_indices,
  bins=data.target_fft.bins,
+ ttype=data.ttype,
  start=0,
  truth=data.truth)
 
@@ -67,6 +72,7 @@ Output shape: [:, 4 * bins] (target stacked real, imag; mixture stacked real, im
  frame_size=data.frame_size,
  zero_based_indices=data.zero_based_indices,
  bins=data.target_fft.bins,
+ ttype=data.ttype,
  start=data.target_fft.bins * 2,
  truth=data.truth)
 
@@ -125,6 +131,7 @@ def _stack_real_imag(data: AudioF,
  frame_size: int,
  zero_based_indices: list[int],
  bins: int,
+ ttype: str,
  start: int,
  truth: Truth) -> Truth:
  import numpy as np
@@ -134,7 +141,8 @@ def _stack_real_imag(data: AudioF,
  b = _get_bin_slice(index + start, bins)
  truth[i, b] = np.real(data)
 
- b = _get_bin_slice(b.stop, bins)
- truth[i, b] = np.imag(data)
+ if ttype != 'tdac-co':
+ b = _get_bin_slice(b.stop, bins)
+ truth[i, b] = np.imag(data)
 
  return truth
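The new ttype argument lets the truth stacker skip the imaginary half for the real-valued 'tdac-co' transform. A standalone numpy sketch of the resulting layout (illustrative only, not sonusai code):

    # Per-class slot layout: complex transforms store [Re(bins) | Im(bins)],
    # while 'tdac-co' stores only the real part, so the slot is half as wide.
    import numpy as np

    bins = 4
    frame = np.arange(bins, dtype=np.float32) + 1j * np.arange(bins, dtype=np.float32)

    stacked = np.concatenate([np.real(frame), np.imag(frame)])   # complex ttype -> 2 * bins values
    real_only = np.real(frame)                                    # ttype == 'tdac-co' -> bins values
    print(stacked.shape, real_only.shape)                         # (8,) (4,)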
sonusai/onnx_predict.py CHANGED
@@ -105,7 +105,7 @@ def main() -> None:
  logger.info('')
  logger.info(f'Run prediction on {input_name}')
  audio = read_audio(input_name)
- feature = get_feature_from_audio(audio=audio, feature=model_metadata.feature)
+ feature = get_feature_from_audio(audio=audio, feature_mode=model_metadata.feature)
 
  predict = pad_and_predict(feature=feature,
  model_name=model_name,
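get_feature_from_audio now takes the feature mode through the feature_mode keyword. A hedged call-site sketch; the import paths, the input file name, and the mode string are placeholders, not taken from this diff:

    # Assumed import paths; 'speech.wav' and the feature-mode string are placeholders.
    from sonusai.mixture import get_feature_from_audio
    from sonusai.mixture import read_audio

    audio = read_audio('speech.wav')
    feature = get_feature_from_audio(audio=audio, feature_mode='<feature mode from model metadata>')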
sonusai/plot.py CHANGED
@@ -314,7 +314,7 @@ def main() -> None:
  raise SonusAIError('Must specify MODEL when input is WAV')
 
  mixture_audio = read_audio(input_name)
- feature = get_feature_from_audio(audio=mixture_audio, feature=model.feature)
+ feature = get_feature_from_audio(audio=mixture_audio, feature_mode=model.feature)
  fg_config = FeatureGeneratorConfig(feature_mode=model.feature,
  num_classes=model.output_shape[-1],
  truth_mutex=False)
@@ -406,11 +406,11 @@ def main() -> None:
  title = f'{input_name}'
  pdf_name = f'{base_name}-plot.pdf'
 
- # Original size [frames, stride, num_bands]
+ # Original size [frames, stride, feature_parameters]
  # Decimate in the stride dimension
- # Reshape to get frames*decimated_stride, num_bands
+ # Reshape to get frames*decimated_stride, feature_parameters
  if feature.ndim != 3:
- raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, num_bands')
+ raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, feature_parameters')
  spectrogram = feature[:, -fg_step:, :]
  spectrogram = np.reshape(spectrogram, (spectrogram.shape[0] * spectrogram.shape[1], spectrogram.shape[2]))
 
@@ -123,7 +123,7 @@ def _process(file: str) -> None:
  from pyaaware import AawareInverseTransform
 
  from sonusai import SonusAIError
- from sonusai.mixture import calculate_audio_from_transform
+ from sonusai.mixture import get_audio_from_transform
  from sonusai.utils import float_to_int16
  from sonusai.utils import unstack_complex
  from sonusai.utils import write_wav
@@ -135,13 +135,13 @@ def _process(file: str) -> None:
  raise SonusAIError(f'Error reading {file}: {e}')
 
  output_name = join(MP_GLOBAL.output_dir, splitext(basename(file))[0] + '.wav')
- audio, _ = calculate_audio_from_transform(data=predict,
- transform=AawareInverseTransform(N=MP_GLOBAL.N,
- R=MP_GLOBAL.R,
- bin_start=MP_GLOBAL.bin_start,
- bin_end=MP_GLOBAL.bin_end,
- ttype=MP_GLOBAL.ttype,
- gain=np.float32(1)))
+ audio, _ = get_audio_from_transform(data=predict,
+ transform=AawareInverseTransform(N=MP_GLOBAL.N,
+ R=MP_GLOBAL.R,
+ bin_start=MP_GLOBAL.bin_start,
+ bin_end=MP_GLOBAL.bin_end,
+ ttype=MP_GLOBAL.ttype,
+ gain=np.float32(1)))
  write_wav(name=output_name, audio=float_to_int16(audio))
 
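Reconstructing a waveform from predicted transform data now goes through get_audio_from_transform. A hedged sketch, assuming sonusai 0.15.9 and pyaaware; the transform parameters are illustrative, and unstack_complex is assumed here to convert stacked real/imag columns back to complex bins:

    # Reconstruction sketch; the transform parameters do not come from a real model.
    import numpy as np
    from pyaaware import AawareInverseTransform

    from sonusai.mixture import get_audio_from_transform
    from sonusai.utils import float_to_int16, unstack_complex, write_wav

    predict = np.zeros((100, 256), dtype=np.float32)      # [frames, 2 * bins] stacked real/imag
    predict_complex = unstack_complex(predict)            # assumed to yield [frames, bins] complex data

    audio, _ = get_audio_from_transform(data=predict_complex,
                                        transform=AawareInverseTransform(N=256,
                                                                         R=64,
                                                                         bin_start=1,
                                                                         bin_end=128,
                                                                         ttype='stft-olsa-hanns',
                                                                         gain=np.float32(1)))
    write_wav(name='predict.wav', audio=float_to_int16(audio))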