sonusai 0.15.8__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. sonusai/__init__.py +35 -4
  2. sonusai/audiofe.py +237 -0
  3. sonusai/calc_metric_spenh.py +21 -12
  4. sonusai/genft.py +2 -1
  5. sonusai/genmixdb.py +5 -5
  6. sonusai/lsdb.py +2 -2
  7. sonusai/main.py +58 -61
  8. sonusai/mixture/__init__.py +4 -2
  9. sonusai/mixture/audio.py +0 -34
  10. sonusai/mixture/config.py +1 -2
  11. sonusai/mixture/datatypes.py +1 -1
  12. sonusai/mixture/feature.py +75 -21
  13. sonusai/mixture/helpers.py +60 -30
  14. sonusai/mixture/log_duration_and_sizes.py +2 -2
  15. sonusai/mixture/mixdb.py +13 -10
  16. sonusai/mixture/spectral_mask.py +14 -14
  17. sonusai/mixture/truth_functions/data.py +1 -1
  18. sonusai/mixture/truth_functions/target.py +2 -2
  19. sonusai/mkmanifest.py +29 -2
  20. sonusai/onnx_predict.py +1 -1
  21. sonusai/plot.py +4 -4
  22. sonusai/post_spenh_targetf.py +8 -8
  23. sonusai/utils/__init__.py +8 -7
  24. sonusai/utils/asl_p56.py +3 -3
  25. sonusai/utils/asr.py +35 -8
  26. sonusai/utils/asr_functions/__init__.py +0 -5
  27. sonusai/utils/asr_functions/aaware_whisper.py +2 -2
  28. sonusai/utils/asr_manifest_functions/__init__.py +1 -0
  29. sonusai/utils/asr_manifest_functions/mcgill_speech.py +29 -0
  30. sonusai/utils/audio_devices.py +41 -0
  31. sonusai/utils/calculate_input_shape.py +3 -4
  32. sonusai/utils/create_timestamp.py +5 -0
  33. sonusai/utils/{trim_docstring.py → docstring.py} +20 -0
  34. sonusai/utils/model_utils.py +30 -0
  35. sonusai/utils/onnx_utils.py +19 -45
  36. sonusai/utils/reshape.py +11 -11
  37. sonusai/utils/wave.py +12 -5
  38. {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/METADATA +8 -19
  39. {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/RECORD +41 -54
  40. {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/WHEEL +1 -1
  41. sonusai/data_generator/__init__.py +0 -5
  42. sonusai/data_generator/dataset_from_mixdb.py +0 -143
  43. sonusai/data_generator/keras_from_mixdb.py +0 -169
  44. sonusai/data_generator/torch_from_mixdb.py +0 -122
  45. sonusai/evaluate.py +0 -245
  46. sonusai/keras_onnx.py +0 -86
  47. sonusai/keras_predict.py +0 -231
  48. sonusai/keras_train.py +0 -334
  49. sonusai/torchl_onnx.py +0 -216
  50. sonusai/torchl_predict.py +0 -547
  51. sonusai/torchl_train.py +0 -223
  52. sonusai/utils/asr_functions/aixplain_whisper.py +0 -59
  53. sonusai/utils/asr_functions/data.py +0 -16
  54. sonusai/utils/asr_functions/deepgram.py +0 -97
  55. sonusai/utils/asr_functions/fastwhisper.py +0 -90
  56. sonusai/utils/asr_functions/google.py +0 -95
  57. sonusai/utils/asr_functions/whisper.py +0 -49
  58. sonusai/utils/keras_utils.py +0 -226
  59. {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/entry_points.txt +0 -0
sonusai/mixture/config.py CHANGED
@@ -480,11 +480,10 @@ def append_noise_files(entry: dict | str, tokens: dict = None) -> list[dict]:
  return noise_files
 
 
- def get_impulse_response_files(config: dict, show_progress: bool = False) -> ImpulseResponseFiles:
+ def get_impulse_response_files(config: dict) -> ImpulseResponseFiles:
  """Get the list of impulse response files from a config
 
  :param config: Config dictionary
- :param show_progress: Show progress bar
  :return: List of impulse response files
  """
  from itertools import chain
sonusai/mixture/datatypes.py CHANGED
@@ -304,7 +304,7 @@ class FeatureGeneratorInfo:
  decimation: int
  stride: int
  step: int
- num_bands: int
+ feature_parameters: int
  ft_config: TransformConfig
  eft_config: TransformConfig
  it_config: TransformConfig
sonusai/mixture/feature.py CHANGED
@@ -1,51 +1,105 @@
+ from typing import Optional
+
+ from sonusai.mixture.datatypes import AudioF
  from sonusai.mixture.datatypes import AudioT
  from sonusai.mixture.datatypes import Feature
 
 
- def get_feature_from_audio(audio: AudioT, feature: str) -> Feature:
- from dataclasses import asdict
+ def get_feature_from_audio(audio: AudioT,
+ feature_mode: str,
+ num_classes: Optional[int] = 1,
+ truth_mutex: Optional[bool] = False) -> Feature:
+ """Apply forward transform and generate feature data from audio data
 
+ :param audio: Time domain audio data [samples]
+ :param feature_mode: Feature mode
+ :param num_classes: Number of classes
+ :param truth_mutex: Whether to calculate 'other' label
+ :return: Feature data [frames, strides, feature_parameters]
+ """
  import numpy as np
  from pyaaware import FeatureGenerator
 
  from .augmentation import pad_audio_to_frame
- from .datatypes import FeatureGeneratorConfig
  from .datatypes import TransformConfig
  from .helpers import forward_transform
- from .truth import truth_reduction
 
- num_classes = 1
- truth_mutex = False
- truth_reduction_function = 'max'
+ fg = FeatureGenerator(feature_mode=feature_mode,
+ num_classes=num_classes,
+ truth_mutex=truth_mutex)
 
- fg_config = FeatureGeneratorConfig(feature_mode=feature,
- num_classes=num_classes,
- truth_mutex=truth_mutex)
- fg = FeatureGenerator(**asdict(fg_config))
  feature_step_samples = fg.ftransform_R * fg.decimation * fg.step
-
  audio = pad_audio_to_frame(audio, feature_step_samples)
- samples = len(audio)
- audio_f = forward_transform(audio, TransformConfig(N=fg.ftransform_N,
+
+ audio_f = forward_transform(audio=audio,
+ config=TransformConfig(N=fg.ftransform_N,
  R=fg.ftransform_R,
  bin_start=fg.bin_start,
  bin_end=fg.bin_end,
  ttype=fg.ftransform_ttype))
 
+ samples = len(audio)
  transform_frames = samples // fg.ftransform_R
  feature_frames = samples // feature_step_samples
 
- truth_t = np.empty((samples, num_classes), dtype=np.float32)
-
- data = np.empty((feature_frames, fg.stride, fg.num_bands), dtype=np.float32)
+ feature = np.empty((feature_frames, fg.stride, fg.feature_parameters), dtype=np.float32)
 
  feature_frame = 0
  for transform_frame in range(transform_frames):
- indices = slice(transform_frame * fg.ftransform_R, (transform_frame + 1) * fg.ftransform_R)
- fg.execute(audio_f[transform_frame], truth_reduction(truth_t[indices], truth_reduction_function))
+ fg.execute(audio_f[transform_frame])
 
  if fg.eof():
- data[feature_frame] = fg.feature()
+ feature[feature_frame] = fg.feature()
  feature_frame += 1
 
- return data
+ return feature
+
+
+ def get_audio_from_feature(feature: Feature,
+ feature_mode: str,
+ num_classes: Optional[int] = 1,
+ truth_mutex: Optional[bool] = False,
+ trim: Optional[bool] = True) -> AudioT:
+ """Apply inverse transform to feature data to generate audio data
+
+ :param feature: Feature data [frames, strides, feature_parameters]
+ :param feature_mode: Feature mode
+ :param num_classes: Number of classes
+ :param truth_mutex: Whether to calculate 'other' label
+ :param trim: Whether to trim the audio data
+ :return: Audio data [samples]
+ """
+ import numpy as np
+
+ from pyaaware import FeatureGenerator
+
+ from .datatypes import TransformConfig
+ from .helpers import inverse_transform
+ from sonusai.utils.stacked_complex import unstack_complex
+
+ fg = FeatureGenerator(feature_mode=feature_mode,
+ num_classes=num_classes,
+ truth_mutex=truth_mutex)
+
+ feature_complex = unstack_complex(feature)
+ if feature_mode[0:1] == 'h':
+ feature_complex = _power_uncompress(feature_complex)
+ return np.squeeze(inverse_transform(transform=feature_complex,
+ config=TransformConfig(N=fg.itransform_N,
+ R=fg.itransform_R,
+ bin_start=fg.bin_start,
+ bin_end=fg.bin_end,
+ ttype=fg.itransform_ttype),
+ trim=trim))
+
+
+ def _power_uncompress(feature: AudioF) -> AudioF:
+ import numpy as np
+
+ mag = np.abs(feature)
+ phase = np.angle(feature)
+ mag = mag ** (1. / 0.3)
+ real_uncompress = mag * np.cos(phase)
+ imag_uncompress = mag * np.sin(phase)
+
+ return real_uncompress + 1j * imag_uncompress
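
The reworked feature.py API above renames the old feature keyword to feature_mode, makes num_classes/truth_mutex optional, and adds an inverse path (get_audio_from_feature) that unstacks the complex feature, undoes the 'h*' power compression, and runs the inverse transform. A minimal round-trip sketch, assuming a stacked-complex feature mode (the mode string below is a placeholder, not necessarily valid for a given pyaaware install) and float32 time-domain audio:

    import numpy as np

    from sonusai.mixture.feature import get_audio_from_feature
    from sonusai.mixture.feature import get_feature_from_audio

    FEATURE_MODE = 'hn00ns1'  # placeholder feature mode; substitute one valid for your install

    # one second of float32 noise standing in for real time-domain audio
    audio = np.random.default_rng(0).uniform(-0.5, 0.5, 16000).astype(np.float32)

    # forward path: audio -> [frames, stride, feature_parameters]
    feature = get_feature_from_audio(audio=audio, feature_mode=FEATURE_MODE)

    # inverse path (new in 0.16.0): unstack complex data, undo 'h*' power compression,
    # apply the inverse transform, and trim so the output is time-aligned with the input
    reconstructed = get_audio_from_feature(feature=feature, feature_mode=FEATURE_MODE)
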
sonusai/mixture/helpers.py CHANGED
@@ -1,5 +1,9 @@
  from typing import Any
 
+ from pyaaware import ForwardTransform
+ from pyaaware import InverseTransform
+
+ from sonusai.mixture import EnergyT
  from sonusai.mixture.datatypes import AudioF
  from sonusai.mixture.datatypes import AudioT
  from sonusai.mixture.datatypes import AudiosT
@@ -78,7 +82,7 @@ def get_feature_generator_info(fg_config: FeatureGeneratorConfig) -> FeatureGene
  decimation=fg.decimation,
  stride=fg.stride,
  step=fg.step,
- num_bands=fg.num_bands,
+ feature_parameters=fg.feature_parameters,
  ft_config=TransformConfig(N=fg.ftransform_N,
  R=fg.ftransform_R,
  bin_start=fg.bin_start,
@@ -327,15 +331,14 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
  import numpy as np
  from pyaaware import FeatureGenerator
 
- from .spectral_mask import apply_spectral_mask
  from .truth import truth_reduction
 
- mixture_f = get_mixture_f(mixdb=mixdb, mixture_audio=mixture_audio)
+ mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)
 
  transform_frames = mixdb.mixture_transform_frames(mixture.samples)
  feature_frames = mixdb.mixture_feature_frames(mixture.samples)
 
- feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.fg_num_bands), dtype=np.float32)
+ feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.feature_parameters), dtype=np.float32)
  truth_f = np.empty((feature_frames, mixdb.num_classes), dtype=np.complex64)
 
  fg = FeatureGenerator(**asdict(mixdb.fg_config))
@@ -350,11 +353,6 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
  truth_f[feature_frame] = fg.truth()
  feature_frame += 1
 
- if mixture.spectral_mask_id is not None:
- feature = apply_spectral_mask(feature=feature,
- spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
- seed=mixture.spectral_mask_seed)
-
  if np.isreal(truth_f).all():
  return feature, truth_f.real
 
@@ -444,14 +442,35 @@ def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT)
  return np.sum(targets_ir, axis=0)
 
 
- def get_mixture_f(mixdb: MixtureDatabase, mixture_audio: AudioT) -> AudioF:
+ def get_mixture_f(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT) -> AudioF:
  """Get the mixture transform for the given mixture
 
  :param mixdb: Mixture database
+ :param mixture: Mixture record
  :param mixture_audio: Mixture audio data for the given mixid
  :return: Mixture transform data
  """
- return forward_transform(mixture_audio, mixdb.ft_config)
+ from .spectral_mask import apply_spectral_mask
+
+ mixture_f = forward_transform(mixture_audio, mixdb.ft_config)
+
+ if mixture.spectral_mask_id is not None:
+ mixture_f = apply_spectral_mask(audio_f=mixture_f,
+ spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
+ seed=mixture.spectral_mask_seed)
+
+ return mixture_f
+
+
+ def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tuple[AudioF, EnergyT]:
+ """Apply forward transform to input audio data to generate transform data
+
+ :param audio: Time domain data [samples]
+ :param transform: ForwardTransform object
+ :return: Frequency domain data [frames, bins], Energy [frames]
+ """
+ f, e = transform.execute_all(audio)
+ return f.transpose(), e
 
 
  def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
@@ -465,17 +484,30 @@ def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
  """
  from pyaaware import AawareForwardTransform
 
- from .audio import calculate_transform_from_audio
-
- audio_f, _ = calculate_transform_from_audio(audio=audio,
- transform=AawareForwardTransform(N=config.N,
- R=config.R,
- bin_start=config.bin_start,
- bin_end=config.bin_end,
- ttype=config.ttype))
+ audio_f, _ = get_transform_from_audio(audio=audio,
+ transform=AawareForwardTransform(N=config.N,
+ R=config.R,
+ bin_start=config.bin_start,
+ bin_end=config.bin_end,
+ ttype=config.ttype))
  return audio_f
 
 
+ def get_audio_from_transform(data: AudioF, transform: InverseTransform, trim: bool = True) -> tuple[AudioT, EnergyT]:
+ """Apply inverse transform to input transform data to generate audio data
+
+ :param data: Frequency domain data [frames, bins]
+ :param transform: InverseTransform object
+ :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the transform
+ :return: Time domain data [samples], Energy [frames]
+ """
+ t, e = transform.execute_all(data.transpose())
+ if trim:
+ t = t[transform.N - transform.R:]
+
+ return t, e
+
+
  def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = True) -> AudioT:
  """Transform frequency domain data into time domain using the inverse transform config from the feature
 
@@ -490,16 +522,14 @@ def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = T
  import numpy as np
  from pyaaware import AawareInverseTransform
 
- from .audio import calculate_audio_from_transform
-
- audio, _ = calculate_audio_from_transform(data=transform,
- transform=AawareInverseTransform(N=config.N,
- R=config.R,
- bin_start=config.bin_start,
- bin_end=config.bin_end,
- ttype=config.ttype,
- gain=np.float32(1)),
- trim=trim)
+ audio, _ = get_audio_from_transform(data=transform,
+ transform=AawareInverseTransform(N=config.N,
+ R=config.R,
+ bin_start=config.bin_start,
+ bin_end=config.bin_end,
+ ttype=config.ttype,
+ gain=np.float32(1)),
+ trim=trim)
  return audio
 
 
@@ -534,7 +564,7 @@ def augmented_target_samples(target_files: TargetFiles,
  it = list(product(*[target_ids, target_augmentation_ids]))
  return sum([estimate_augmented_length_from_length(
  length=target_files[fi].samples,
- tempo=target_augmentations[ai].tempo,
+ tempo=float(target_augmentations[ai].tempo),
  frame_length=feature_step_samples) for fi, ai, in it])
 
 
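Within helpers.py, the former calculate_transform_from_audio/calculate_audio_from_transform helpers (previously imported from mixture/audio.py, which this release removes) are now get_transform_from_audio() and get_audio_from_transform(), and forward_transform()/inverse_transform() wrap them. A small sketch of the wrapper API, assuming illustrative transform parameters (in practice N, R, bins, and ttype come from the feature generator config):

    import numpy as np

    from sonusai.mixture.datatypes import TransformConfig
    from sonusai.mixture.helpers import forward_transform, inverse_transform

    # illustrative values only; 'stft-olsa-hanns' is assumed to be an available ttype
    config = TransformConfig(N=256, R=64, bin_start=1, bin_end=128, ttype='stft-olsa-hanns')

    audio = np.random.default_rng(0).uniform(-0.5, 0.5, 16000).astype(np.float32)

    # forward_transform() delegates to get_transform_from_audio(); result is [frames, bins]
    audio_f = forward_transform(audio, config)

    # inverse_transform() delegates to get_audio_from_transform(); trim=True drops the first
    # N - R samples so the reconstruction is time-aligned with the original waveform
    reconstructed = inverse_transform(audio_f, config, trim=True)
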
sonusai/mixture/log_duration_and_sizes.py CHANGED
@@ -1,7 +1,7 @@
  def log_duration_and_sizes(total_duration: float,
  num_classes: int,
  feature_step_samples: int,
- num_bands: int,
+ feature_parameters: int,
  stride: int,
  desc: str) -> None:
  from sonusai import logger
@@ -14,7 +14,7 @@ def log_duration_and_sizes(total_duration: float,
  total_samples = int(total_duration * SAMPLE_RATE)
  mixture_bytes = total_samples * SAMPLE_BYTES
  truth_t_bytes = total_samples * num_classes * FLOAT_BYTES
- feature_bytes = total_samples / feature_step_samples * stride * num_bands * FLOAT_BYTES
+ feature_bytes = total_samples / feature_step_samples * stride * feature_parameters * FLOAT_BYTES
  truth_f_bytes = total_samples / feature_step_samples * num_classes * FLOAT_BYTES
 
  logger.info('')
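
For scale, a worked instance of the feature-size estimate above, assuming SonusAI's 16 kHz sample rate and 4-byte floats (all other numbers are illustrative):

    # assumptions: SAMPLE_RATE = 16000, FLOAT_BYTES = 4; one hour of audio with
    # feature_step_samples = 1024, stride = 6, feature_parameters = 64 (illustrative only)
    total_samples = int(3600.0 * 16000)                 # 57,600,000 samples
    feature_bytes = total_samples / 1024 * 6 * 64 * 4   # 86,400,000 bytes
    print(f'{feature_bytes / 2**20:.1f} MiB')           # ~82.4 MiB for this configuration
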
sonusai/mixture/mixdb.py CHANGED
@@ -248,8 +248,8 @@ class MixtureDatabase:
  return self.fg_info.step
 
  @cached_property
- def fg_num_bands(self) -> int:
- return self.fg_info.num_bands
+ def feature_parameters(self) -> int:
+ return self.fg_info.feature_parameters
 
  @cached_property
  def ft_config(self) -> TransformConfig:
@@ -809,11 +809,20 @@
  :return: Mixture transform data
  """
  from .helpers import forward_transform
+ from .spectral_mask import apply_spectral_mask
 
  if force or mixture is None:
  mixture = self.mixture_mixture(m_id, targets, target, noise, force)
 
- return forward_transform(mixture, self.ft_config)
+ mixture_f = forward_transform(mixture, self.ft_config)
+
+ m = self.mixture(m_id)
+ if m.spectral_mask_id is not None:
+ mixture_f = apply_spectral_mask(audio_f=mixture_f,
+ spectral_mask=self.spectral_mask(int(m.spectral_mask_id)),
+ seed=m.spectral_mask_seed)
+
+ return mixture_f
 
  def mixture_truth_t(self,
  m_id: int,
@@ -938,7 +947,6 @@
  import numpy as np
  from pyaaware import FeatureGenerator
 
- from .spectral_mask import apply_spectral_mask
  from .truth import truth_reduction
 
  if not force:
@@ -964,7 +972,7 @@
  if truth_t is None:
  truth_t = np.zeros((m.samples, self.num_classes), dtype=np.float32)
 
- feature = np.empty((feature_frames, self.fg_stride, self.fg_num_bands), dtype=np.float32)
+ feature = np.empty((feature_frames, self.fg_stride, self.feature_parameters), dtype=np.float32)
  truth_f = np.empty((feature_frames, self.num_classes), dtype=np.complex64)
 
  fg = FeatureGenerator(**asdict(self.fg_config))
@@ -979,11 +987,6 @@
  truth_f[feature_frame] = fg.truth()
  feature_frame += 1
 
- if m.spectral_mask_id is not None:
- feature = apply_spectral_mask(feature=feature,
- spectral_mask=self.spectral_mask(int(m.spectral_mask_id)),
- seed=m.spectral_mask_seed)
-
  if np.isreal(truth_f).all():
  return feature, truth_f.real
 
sonusai/mixture/spectral_mask.py CHANGED
@@ -1,23 +1,23 @@
- from sonusai.mixture.datatypes import Feature
+ from sonusai.mixture.datatypes import AudioF
  from sonusai.mixture.datatypes import SpectralMask
 
 
- def apply_spectral_mask(feature: Feature, spectral_mask: SpectralMask, seed: int = None) -> Feature:
+ def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int = None) -> AudioF:
  """Apply frequency and time masking
 
  Implementation of SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
 
  Ref: https://arxiv.org/pdf/1904.08779.pdf
 
- f_width consecutive bands [f_start, f_start + f_width) are masked, where f_width is chosen from a uniform
- distribution from 0 to the f_max_width, and f_start is chosen from [0, bands - f_width).
+ f_width consecutive bins [f_start, f_start + f_width) are masked, where f_width is chosen from a uniform
+ distribution from 0 to the f_max_width, and f_start is chosen from [0, bins - f_width).
 
  t_width consecutive frames [t_start, t_start + t_width) are masked, where t_width is chosen from a uniform
  distribution from 0 to the t_max_width, and t_start is chosen from [0, frames - t_width).
 
  A time mask cannot be wider than t_max_percent times the number of frames.
 
- :param feature: Numpy array of feature data [frames, strides, bands]
+ :param audio_f: Numpy array of transform audio data [frames, bins]
  :param spectral_mask: Spectral mask parameters
  :param seed: Random number seed
  :return: Augmented feature
@@ -26,28 +26,28 @@ def apply_spectral_mask(feature: Feature, spectral_mask: SpectralMask, seed: int
 
  from sonusai import SonusAIError
 
- if feature.ndim != 3:
- raise SonusAIError('feature input must have three dimensions [frames, strides, bands]')
+ if audio_f.ndim != 2:
+ raise SonusAIError('feature input must have three dimensions [frames, bins]')
 
- frames, strides, bands = feature.shape
+ frames, bins = audio_f.shape
 
  f_max_width = spectral_mask.f_max_width
- if f_max_width not in range(0, bands + 1):
- f_max_width = bands
+ if f_max_width not in range(0, bins + 1):
+ f_max_width = bins
 
  rng = np.random.default_rng(seed)
 
  # apply f_num frequency masks to the feature
  for _ in range(spectral_mask.f_num):
  f_width = int(rng.uniform(0, f_max_width))
- f_start = rng.integers(0, bands - f_width, endpoint=True)
- feature[:, :, f_start:f_start + f_width] = 0
+ f_start = rng.integers(0, bins - f_width, endpoint=True)
+ audio_f[:, f_start:f_start + f_width] = 0
 
  # apply t_num time masks to the feature
  t_upper_bound = int(spectral_mask.t_max_percent / 100 * frames)
  for _ in range(spectral_mask.t_num):
  t_width = min(int(rng.uniform(0, spectral_mask.t_max_width)), t_upper_bound)
  t_start = rng.integers(0, frames - t_width, endpoint=True)
- feature[t_start:t_start + t_width, :, :] = 0
+ audio_f[t_start:t_start + t_width, :] = 0
 
- return feature
+ return audio_f
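
apply_spectral_mask() now operates on forward-transform data shaped [frames, bins] rather than on the stride-expanded feature; as the helpers.py and mixdb.py hunks show, the masking itself moved out of the feature-generation paths and into the mixture transform helpers. A minimal usage sketch, assuming the SpectralMask dataclass exposes exactly the five fields the function reads:

    import numpy as np

    from sonusai.mixture.datatypes import SpectralMask
    from sonusai.mixture.spectral_mask import apply_spectral_mask

    # [frames, bins] forward-transform output standing in for a real mixture transform
    audio_f = np.ones((100, 128), dtype=np.complex64)

    # field names inferred from the function body above; adjust to the installed datatypes
    mask = SpectralMask(f_max_width=16, f_num=2, t_max_width=10, t_num=2, t_max_percent=20)

    # zeroes f_num random bin ranges and t_num random frame ranges, reproducible via seed
    masked = apply_spectral_mask(audio_f=audio_f, spectral_mask=mask, seed=42)
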
sonusai/mixture/truth_functions/data.py CHANGED
@@ -23,7 +23,7 @@ class Data:
  num_classes=config.num_classes,
  truth_mutex=config.mutex)
 
- self.num_bands = fg.num_bands
+ self.feature_parameters = fg.feature_parameters
  self.ttype = fg.ftransform_ttype
  self.frame_size = fg.ftransform_R
 
sonusai/mixture/truth_functions/target.py CHANGED
@@ -19,7 +19,7 @@ Output shape: [:, num_classes]
 
  from sonusai import SonusAIError
 
- if data.config.num_classes != data.num_bands:
+ if data.config.num_classes != data.feature_parameters:
  raise SonusAIError(f'Invalid num_classes for target_f truth: {data.config.num_classes}')
 
  target_freq = _execute_fft(data.target_audio, data.target_fft, len(data.offsets))
@@ -51,7 +51,7 @@ Output shape: [:, 2 * num_classes]
  """
  from sonusai import SonusAIError
 
- if data.config.num_classes != 2 * data.num_bands:
+ if data.config.num_classes != 2 * data.feature_parameters:
  raise SonusAIError(f'Invalid num_classes for target_mixture_f truth: {data.config.num_classes}')
 
  target_freq = _execute_fft(data.target_audio, data.target_fft, len(data.offsets))
sonusai/mkmanifest.py CHANGED
@@ -30,6 +30,8 @@ Inputs:
  - 'librispeech'
  - 'vctk_noisy_speech' expects subdirs named like <name>_wav/ and <name>_txt/ with files in
  each using same basename, but with .wav and .txt respectively.
+ - 'mcgill-speech' expects audio data in basename/speakerid/speakerid-promptid.wav and
+ transcript data in Scripts/HarvardLists.dat
  ADAT Audio data environment variable. All found files will be expanded to their full, absolute path and
  then parts of the path that match the specified environment variable value will be replaced with
  the variable. This accommodates portability across platforms where the sound datasets may in
@@ -42,11 +44,11 @@ Outputs the following to the current directory:
 
  Example usage for LibriSpeech:
  sonusai mkmanifest -mlibrispeech -eADAT -oasr_manifest.json --include='*.flac' train-clean-100
-
+ sonusai mkmanifest -m mcgill-speech -e ADAT -o asr_manifest_16k.json 16k-LP7/
  """
  from sonusai import logger
 
- VALID_METHOD = ['librispeech', 'vctk_noisy_speech']
+ VALID_METHOD = ['librispeech', 'vctk_noisy_speech', 'mcgill-speech']
 
 
  def main() -> None:
@@ -88,6 +90,7 @@ def main() -> None:
  from sonusai.utils.asr_manifest_functions import collect_vctk_noisy_speech_transcripts
  from sonusai.utils.asr_manifest_functions import get_librispeech_manifest_entry
  from sonusai.utils.asr_manifest_functions import get_vctk_noisy_speech_manifest_entry
+ from sonusai.utils.asr_manifest_functions import get_mcgill_speech_manifest_entry
 
  start_time = time.monotonic()
 
@@ -160,6 +163,30 @@ def main() -> None:
  for result in results:
  f.write(json.dumps(result) + '\n')
 
+ if method == 'mcgill-speech':
+ logger.info(f'Found {len(entries)} Mcgill Speech files, opening prompt file ...')
+ # Note expecting only one path pointing to data subdir
+ if len(paths) != 1:
+ raise SonusAIError(f'mcgill-speech only support a single path')
+ prompt_fpath = join(join(realpath(abspath(paths[0]))), '../Scripts/HarvardList.dat')
+ with open(prompt_fpath, encoding='utf-8') as f:
+ lines = f.readlines()
+
+ logger.info(f'Found {len(lines) - 4} entries in prompt file.')
+ # First 4 lines are header stuff, can use remaining directly with simple lookup
+ # example line: '01_02:Glue the sheet ...\n' (paragraph 1, sentence 2)
+ # 11 entries per group, so getting line is 11*(p1-1)+(s2-1)
+ lines = lines[4:]
+
+ processing_func = partial(get_mcgill_speech_manifest_entry, transcript_data=lines)
+ progress = tqdm(total=len(entries), desc='Creating Mcgill Speech manifest data')
+ results = pp_tqdm_imap(processing_func, entries, progress=progress)
+ progress.close()
+
+ with open(output, 'w') as f:
+ for result in results:
+ f.write(json.dumps(result) + '\n')
+
  end_time = time.monotonic()
  logger.info('')
  logger.info(f'Completed in {seconds_to_hms(seconds=end_time - start_time)}')
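
The comments in the new mcgill-speech branch encode the prompt lookup: after dropping the four header lines, the transcript for paragraph p, sentence s sits at index 11*(p-1)+(s-1). A hypothetical sketch of that lookup (the real parsing lives in get_mcgill_speech_manifest_entry, whose implementation is not shown in this diff):

    # assumes prompt lines look like '01_02:Glue the sheet to the dark blue background.\n'
    # and that each entry is addressed by a 'PP_SS' paragraph/sentence id (hypothetical helper)
    def lookup_transcript(prompt_id: str, prompt_lines: list[str]) -> str:
        paragraph, sentence = (int(part) for part in prompt_id.split('_')[:2])
        line = prompt_lines[11 * (paragraph - 1) + (sentence - 1)]
        return line.split(':', maxsplit=1)[1].strip()  # drop the 'PP_SS:' prefix and newline

    prompts = ['01_01:The birch canoe slid on the smooth planks.\n',
               '01_02:Glue the sheet to the dark blue background.\n']
    print(lookup_transcript('01_02', prompts))  # Glue the sheet to the dark blue background.
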
sonusai/onnx_predict.py CHANGED
@@ -105,7 +105,7 @@ def main() -> None:
  logger.info('')
  logger.info(f'Run prediction on {input_name}')
  audio = read_audio(input_name)
- feature = get_feature_from_audio(audio=audio, feature=model_metadata.feature)
+ feature = get_feature_from_audio(audio=audio, feature_mode=model_metadata.feature)
 
  predict = pad_and_predict(feature=feature,
  model_name=model_name,
sonusai/plot.py CHANGED
@@ -314,7 +314,7 @@ def main() -> None:
  raise SonusAIError('Must specify MODEL when input is WAV')
 
  mixture_audio = read_audio(input_name)
- feature = get_feature_from_audio(audio=mixture_audio, feature=model.feature)
+ feature = get_feature_from_audio(audio=mixture_audio, feature_mode=model.feature)
  fg_config = FeatureGeneratorConfig(feature_mode=model.feature,
  num_classes=model.output_shape[-1],
  truth_mutex=False)
@@ -406,11 +406,11 @@ def main() -> None:
  title = f'{input_name}'
  pdf_name = f'{base_name}-plot.pdf'
 
- # Original size [frames, stride, num_bands]
+ # Original size [frames, stride, feature_parameters]
  # Decimate in the stride dimension
- # Reshape to get frames*decimated_stride, num_bands
+ # Reshape to get frames*decimated_stride, feature_parameters
  if feature.ndim != 3:
- raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, num_bands')
+ raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, feature_parameters')
  spectrogram = feature[:, -fg_step:, :]
  spectrogram = np.reshape(spectrogram, (spectrogram.shape[0] * spectrogram.shape[1], spectrogram.shape[2]))
 
sonusai/post_spenh_targetf.py CHANGED
@@ -123,7 +123,7 @@ def _process(file: str) -> None:
  from pyaaware import AawareInverseTransform
 
  from sonusai import SonusAIError
- from sonusai.mixture import calculate_audio_from_transform
+ from sonusai.mixture import get_audio_from_transform
  from sonusai.utils import float_to_int16
  from sonusai.utils import unstack_complex
  from sonusai.utils import write_wav
@@ -135,13 +135,13 @@ def _process(file: str) -> None:
  raise SonusAIError(f'Error reading {file}: {e}')
 
  output_name = join(MP_GLOBAL.output_dir, splitext(basename(file))[0] + '.wav')
- audio, _ = calculate_audio_from_transform(data=predict,
- transform=AawareInverseTransform(N=MP_GLOBAL.N,
- R=MP_GLOBAL.R,
- bin_start=MP_GLOBAL.bin_start,
- bin_end=MP_GLOBAL.bin_end,
- ttype=MP_GLOBAL.ttype,
- gain=np.float32(1)))
+ audio, _ = get_audio_from_transform(data=predict,
+ transform=AawareInverseTransform(N=MP_GLOBAL.N,
+ R=MP_GLOBAL.R,
+ bin_start=MP_GLOBAL.bin_start,
+ bin_end=MP_GLOBAL.bin_end,
+ ttype=MP_GLOBAL.ttype,
+ gain=np.float32(1)))
  write_wav(name=output_name, audio=float_to_int16(audio))
 
 
sonusai/utils/__init__.py CHANGED
@@ -1,33 +1,35 @@
  # SonusAI general utilities
  from .asl_p56 import asl_p56
+ from .asr import ASRData
  from .asr import ASRResult
  from .asr import calc_asr
+ from .audio_devices import get_default_input_device
+ from .audio_devices import get_input_device_index_by_name
+ from .audio_devices import get_input_devices
  from .braced_glob import braced_glob
  from .braced_glob import braced_iglob
  from .calculate_input_shape import calculate_input_shape
  from .convert_string_to_number import convert_string_to_number
+ from .create_timestamp import create_timestamp
  from .create_ts_name import create_ts_name
  from .dataclass_from_dict import dataclass_from_dict
  from .db import db_to_linear
  from .db import linear_to_db
+ from .docstring import add_commands_to_docstring
+ from .docstring import trim_docstring
  from .energy_f import compute_energy_f
  from .engineering_number import EngineeringNumber
  from .get_frames_per_batch import get_frames_per_batch
  from .get_label_names import get_label_names
  from .grouper import grouper
  from .human_readable_size import human_readable_size
- from .keras_utils import check_keras_overrides
- from .keras_utils import create_onnx_from_keras
- from .keras_utils import import_and_check_keras_model
- from .keras_utils import import_keras_model
- from .keras_utils import keras_onnx
  from .max_text_width import max_text_width
+ from .model_utils import import_module
  from .numeric_conversion import float_to_int16
  from .numeric_conversion import int16_to_float
  from .onnx_utils import SonusAIMetaData
  from .onnx_utils import add_sonusai_metadata
  from .onnx_utils import get_sonusai_metadata
- from .onnx_utils import replace_stateful_grus
  from .parallel import pp_imap
  from .parallel import pp_tqdm_imap
  from .print_mixture_details import print_class_count
@@ -46,6 +48,5 @@ from .stacked_complex import stacked_complex_imag
  from .stacked_complex import stacked_complex_real
  from .stacked_complex import unstack_complex
  from .stratified_shuffle_split import stratified_shuffle_split_mixid
- from .trim_docstring import trim_docstring
  from .wave import write_wav
  from .yes_or_no import yes_or_no