sonusai 0.15.8__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +35 -4
- sonusai/audiofe.py +237 -0
- sonusai/calc_metric_spenh.py +21 -12
- sonusai/genft.py +2 -1
- sonusai/genmixdb.py +5 -5
- sonusai/lsdb.py +2 -2
- sonusai/main.py +58 -61
- sonusai/mixture/__init__.py +4 -2
- sonusai/mixture/audio.py +0 -34
- sonusai/mixture/config.py +1 -2
- sonusai/mixture/datatypes.py +1 -1
- sonusai/mixture/feature.py +75 -21
- sonusai/mixture/helpers.py +60 -30
- sonusai/mixture/log_duration_and_sizes.py +2 -2
- sonusai/mixture/mixdb.py +13 -10
- sonusai/mixture/spectral_mask.py +14 -14
- sonusai/mixture/truth_functions/data.py +1 -1
- sonusai/mixture/truth_functions/target.py +2 -2
- sonusai/mkmanifest.py +29 -2
- sonusai/onnx_predict.py +1 -1
- sonusai/plot.py +4 -4
- sonusai/post_spenh_targetf.py +8 -8
- sonusai/utils/__init__.py +8 -7
- sonusai/utils/asl_p56.py +3 -3
- sonusai/utils/asr.py +35 -8
- sonusai/utils/asr_functions/__init__.py +0 -5
- sonusai/utils/asr_functions/aaware_whisper.py +2 -2
- sonusai/utils/asr_manifest_functions/__init__.py +1 -0
- sonusai/utils/asr_manifest_functions/mcgill_speech.py +29 -0
- sonusai/utils/audio_devices.py +41 -0
- sonusai/utils/calculate_input_shape.py +3 -4
- sonusai/utils/create_timestamp.py +5 -0
- sonusai/utils/{trim_docstring.py → docstring.py} +20 -0
- sonusai/utils/model_utils.py +30 -0
- sonusai/utils/onnx_utils.py +19 -45
- sonusai/utils/reshape.py +11 -11
- sonusai/utils/wave.py +12 -5
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/METADATA +8 -19
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/RECORD +41 -54
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/WHEEL +1 -1
- sonusai/data_generator/__init__.py +0 -5
- sonusai/data_generator/dataset_from_mixdb.py +0 -143
- sonusai/data_generator/keras_from_mixdb.py +0 -169
- sonusai/data_generator/torch_from_mixdb.py +0 -122
- sonusai/evaluate.py +0 -245
- sonusai/keras_onnx.py +0 -86
- sonusai/keras_predict.py +0 -231
- sonusai/keras_train.py +0 -334
- sonusai/torchl_onnx.py +0 -216
- sonusai/torchl_predict.py +0 -547
- sonusai/torchl_train.py +0 -223
- sonusai/utils/asr_functions/aixplain_whisper.py +0 -59
- sonusai/utils/asr_functions/data.py +0 -16
- sonusai/utils/asr_functions/deepgram.py +0 -97
- sonusai/utils/asr_functions/fastwhisper.py +0 -90
- sonusai/utils/asr_functions/google.py +0 -95
- sonusai/utils/asr_functions/whisper.py +0 -49
- sonusai/utils/keras_utils.py +0 -226
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/entry_points.txt +0 -0
sonusai/mixture/config.py
CHANGED
@@ -480,11 +480,10 @@ def append_noise_files(entry: dict | str, tokens: dict = None) -> list[dict]:
     return noise_files


-def get_impulse_response_files(config: dict
+def get_impulse_response_files(config: dict) -> ImpulseResponseFiles:
     """Get the list of impulse response files from a config

     :param config: Config dictionary
-    :param show_progress: Show progress bar
     :return: List of impulse response files
     """
     from itertools import chain
sonusai/mixture/datatypes.py
CHANGED
sonusai/mixture/feature.py
CHANGED
@@ -1,51 +1,105 @@
+from typing import Optional
+
+from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import Feature


-def get_feature_from_audio(audio: AudioT,
-
+def get_feature_from_audio(audio: AudioT,
+                           feature_mode: str,
+                           num_classes: Optional[int] = 1,
+                           truth_mutex: Optional[bool] = False) -> Feature:
+    """Apply forward transform and generate feature data from audio data

+    :param audio: Time domain audio data [samples]
+    :param feature_mode: Feature mode
+    :param num_classes: Number of classes
+    :param truth_mutex: Whether to calculate 'other' label
+    :return: Feature data [frames, strides, feature_parameters]
+    """
     import numpy as np
     from pyaaware import FeatureGenerator

     from .augmentation import pad_audio_to_frame
-    from .datatypes import FeatureGeneratorConfig
     from .datatypes import TransformConfig
     from .helpers import forward_transform
-    from .truth import truth_reduction

-
-
-
+    fg = FeatureGenerator(feature_mode=feature_mode,
+                          num_classes=num_classes,
+                          truth_mutex=truth_mutex)

-    fg_config = FeatureGeneratorConfig(feature_mode=feature,
-                                       num_classes=num_classes,
-                                       truth_mutex=truth_mutex)
-    fg = FeatureGenerator(**asdict(fg_config))
     feature_step_samples = fg.ftransform_R * fg.decimation * fg.step
-
     audio = pad_audio_to_frame(audio, feature_step_samples)
-
-    audio_f = forward_transform(audio
+
+    audio_f = forward_transform(audio=audio,
+                                config=TransformConfig(N=fg.ftransform_N,
                                                        R=fg.ftransform_R,
                                                        bin_start=fg.bin_start,
                                                        bin_end=fg.bin_end,
                                                        ttype=fg.ftransform_ttype))

+    samples = len(audio)
     transform_frames = samples // fg.ftransform_R
     feature_frames = samples // feature_step_samples

-
-
-    data = np.empty((feature_frames, fg.stride, fg.num_bands), dtype=np.float32)
+    feature = np.empty((feature_frames, fg.stride, fg.feature_parameters), dtype=np.float32)

     feature_frame = 0
     for transform_frame in range(transform_frames):
-
-        fg.execute(audio_f[transform_frame], truth_reduction(truth_t[indices], truth_reduction_function))
+        fg.execute(audio_f[transform_frame])

         if fg.eof():
-
+            feature[feature_frame] = fg.feature()
             feature_frame += 1

-    return
+    return feature
+
+
+def get_audio_from_feature(feature: Feature,
+                           feature_mode: str,
+                           num_classes: Optional[int] = 1,
+                           truth_mutex: Optional[bool] = False,
+                           trim: Optional[bool] = True) -> AudioT:
+    """Apply inverse transform to feature data to generate audio data
+
+    :param feature: Feature data [frames, strides, feature_parameters]
+    :param feature_mode: Feature mode
+    :param num_classes: Number of classes
+    :param truth_mutex: Whether to calculate 'other' label
+    :param trim: Whether to trim the audio data
+    :return: Audio data [samples]
+    """
+    import numpy as np
+
+    from pyaaware import FeatureGenerator
+
+    from .datatypes import TransformConfig
+    from .helpers import inverse_transform
+    from sonusai.utils.stacked_complex import unstack_complex
+
+    fg = FeatureGenerator(feature_mode=feature_mode,
+                          num_classes=num_classes,
+                          truth_mutex=truth_mutex)
+
+    feature_complex = unstack_complex(feature)
+    if feature_mode[0:1] == 'h':
+        feature_complex = _power_uncompress(feature_complex)
+    return np.squeeze(inverse_transform(transform=feature_complex,
+                                        config=TransformConfig(N=fg.itransform_N,
+                                                               R=fg.itransform_R,
+                                                               bin_start=fg.bin_start,
+                                                               bin_end=fg.bin_end,
+                                                               ttype=fg.itransform_ttype),
+                                        trim=trim))
+
+
+def _power_uncompress(feature: AudioF) -> AudioF:
+    import numpy as np
+
+    mag = np.abs(feature)
+    phase = np.angle(feature)
+    mag = mag ** (1. / 0.3)
+    real_uncompress = mag * np.cos(phase)
+    imag_uncompress = mag * np.sin(phase)
+
+    return real_uncompress + 1j * imag_uncompress
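Usage sketch of the two helpers added above, based only on the signatures shown in this diff (not taken from the package docs); the feature-mode string and the read_audio import location are illustrative assumptions:

    from sonusai.mixture import read_audio                      # assumed import location
    from sonusai.mixture.feature import get_feature_from_audio
    from sonusai.mixture.feature import get_audio_from_feature

    feature_mode = 'hum00020'                                   # hypothetical feature mode name
    audio = read_audio('speech.wav')                            # AudioT: time-domain samples
    feature = get_feature_from_audio(audio=audio, feature_mode=feature_mode)
    # feature has shape [frames, strides, feature_parameters]
    reconstructed = get_audio_from_feature(feature=feature, feature_mode=feature_mode)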
sonusai/mixture/helpers.py
CHANGED
@@ -1,5 +1,9 @@
 from typing import Any

+from pyaaware import ForwardTransform
+from pyaaware import InverseTransform
+
+from sonusai.mixture import EnergyT
 from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import AudioT
 from sonusai.mixture.datatypes import AudiosT
@@ -78,7 +82,7 @@ def get_feature_generator_info(fg_config: FeatureGeneratorConfig) -> FeatureGene
                                decimation=fg.decimation,
                                stride=fg.stride,
                                step=fg.step,
-
+                               feature_parameters=fg.feature_parameters,
                                ft_config=TransformConfig(N=fg.ftransform_N,
                                                          R=fg.ftransform_R,
                                                          bin_start=fg.bin_start,
@@ -327,15 +331,14 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
     import numpy as np
     from pyaaware import FeatureGenerator

-    from .spectral_mask import apply_spectral_mask
     from .truth import truth_reduction

-    mixture_f = get_mixture_f(mixdb=mixdb, mixture_audio=mixture_audio)
+    mixture_f = get_mixture_f(mixdb=mixdb, mixture=mixture, mixture_audio=mixture_audio)

     transform_frames = mixdb.mixture_transform_frames(mixture.samples)
     feature_frames = mixdb.mixture_feature_frames(mixture.samples)

-    feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.
+    feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.feature_parameters), dtype=np.float32)
     truth_f = np.empty((feature_frames, mixdb.num_classes), dtype=np.complex64)

     fg = FeatureGenerator(**asdict(mixdb.fg_config))
@@ -350,11 +353,6 @@ def get_ft(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT, trut
             truth_f[feature_frame] = fg.truth()
             feature_frame += 1

-    if mixture.spectral_mask_id is not None:
-        feature = apply_spectral_mask(feature=feature,
-                                      spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
-                                      seed=mixture.spectral_mask_seed)
-
     if np.isreal(truth_f).all():
         return feature, truth_f.real

@@ -444,14 +442,35 @@ def get_target(mixdb: MixtureDatabase, mixture: Mixture, targets_audio: AudiosT)
     return np.sum(targets_ir, axis=0)


-def get_mixture_f(mixdb: MixtureDatabase, mixture_audio: AudioT) -> AudioF:
+def get_mixture_f(mixdb: MixtureDatabase, mixture: Mixture, mixture_audio: AudioT) -> AudioF:
     """Get the mixture transform for the given mixture

     :param mixdb: Mixture database
+    :param mixture: Mixture record
     :param mixture_audio: Mixture audio data for the given mixid
     :return: Mixture transform data
     """
-
+    from .spectral_mask import apply_spectral_mask
+
+    mixture_f = forward_transform(mixture_audio, mixdb.ft_config)
+
+    if mixture.spectral_mask_id is not None:
+        mixture_f = apply_spectral_mask(audio_f=mixture_f,
+                                        spectral_mask=mixdb.spectral_mask(mixture.spectral_mask_id),
+                                        seed=mixture.spectral_mask_seed)
+
+    return mixture_f
+
+
+def get_transform_from_audio(audio: AudioT, transform: ForwardTransform) -> tuple[AudioF, EnergyT]:
+    """Apply forward transform to input audio data to generate transform data
+
+    :param audio: Time domain data [samples]
+    :param transform: ForwardTransform object
+    :return: Frequency domain data [frames, bins], Energy [frames]
+    """
+    f, e = transform.execute_all(audio)
+    return f.transpose(), e


 def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
@@ -465,17 +484,30 @@ def forward_transform(audio: AudioT, config: TransformConfig) -> AudioF:
     """
     from pyaaware import AawareForwardTransform

-
-
-
-
-
-
-                                             bin_end=config.bin_end,
-                                             ttype=config.ttype))
+    audio_f, _ = get_transform_from_audio(audio=audio,
+                                          transform=AawareForwardTransform(N=config.N,
+                                                                           R=config.R,
+                                                                           bin_start=config.bin_start,
+                                                                           bin_end=config.bin_end,
+                                                                           ttype=config.ttype))
     return audio_f


+def get_audio_from_transform(data: AudioF, transform: InverseTransform, trim: bool = True) -> tuple[AudioT, EnergyT]:
+    """Apply inverse transform to input transform data to generate audio data
+
+    :param data: Frequency domain data [frames, bins]
+    :param transform: InverseTransform object
+    :param trim: Removes starting samples so output waveform will be time-aligned with input waveform to the transform
+    :return: Time domain data [samples], Energy [frames]
+    """
+    t, e = transform.execute_all(data.transpose())
+    if trim:
+        t = t[transform.N - transform.R:]
+
+    return t, e
+
+
 def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = True) -> AudioT:
     """Transform frequency domain data into time domain using the inverse transform config from the feature

@@ -490,16 +522,14 @@ def inverse_transform(transform: AudioF, config: TransformConfig, trim: bool = T
     import numpy as np
     from pyaaware import AawareInverseTransform

-
-
-
-
-
-
-
-
-                                            gain=np.float32(1)),
-                          trim=trim)
+    audio, _ = get_audio_from_transform(data=transform,
+                                        transform=AawareInverseTransform(N=config.N,
+                                                                         R=config.R,
+                                                                         bin_start=config.bin_start,
+                                                                         bin_end=config.bin_end,
+                                                                         ttype=config.ttype,
+                                                                         gain=np.float32(1)),
+                                        trim=trim)
     return audio


@@ -534,7 +564,7 @@ def augmented_target_samples(target_files: TargetFiles,
     it = list(product(*[target_ids, target_augmentation_ids]))
     return sum([estimate_augmented_length_from_length(
         length=target_files[fi].samples,
-        tempo=target_augmentations[ai].tempo,
+        tempo=float(target_augmentations[ai].tempo),
         frame_length=feature_step_samples) for fi, ai, in it])

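Usage sketch of the refactored transform helpers (a minimal example, not from the package docs; the N/R/bin/ttype values below are illustrative assumptions, not package defaults):

    import numpy as np

    from sonusai.mixture.datatypes import TransformConfig
    from sonusai.mixture.helpers import forward_transform, inverse_transform

    config = TransformConfig(N=256, R=64, bin_start=0, bin_end=128, ttype='stft-olsa-hanns')  # assumed values
    audio = np.zeros(16000, dtype=np.float32)       # one second of audio at an assumed 16 kHz rate
    audio_f = forward_transform(audio, config)      # complex transform data [frames, bins]
    audio_r = inverse_transform(audio_f, config)    # trim=True time-aligns the output with the input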
sonusai/mixture/log_duration_and_sizes.py
CHANGED
@@ -1,7 +1,7 @@
 def log_duration_and_sizes(total_duration: float,
                            num_classes: int,
                            feature_step_samples: int,
-
+                           feature_parameters: int,
                            stride: int,
                            desc: str) -> None:
     from sonusai import logger
@@ -14,7 +14,7 @@ def log_duration_and_sizes(total_duration: float,
     total_samples = int(total_duration * SAMPLE_RATE)
     mixture_bytes = total_samples * SAMPLE_BYTES
     truth_t_bytes = total_samples * num_classes * FLOAT_BYTES
-    feature_bytes = total_samples / feature_step_samples * stride *
+    feature_bytes = total_samples / feature_step_samples * stride * feature_parameters * FLOAT_BYTES
     truth_f_bytes = total_samples / feature_step_samples * num_classes * FLOAT_BYTES

     logger.info('')
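Quick sanity check of the feature size estimate above, using assumed values (16 kHz sample rate, 4-byte floats, feature_step_samples=1024, stride=10, feature_parameters=80):

    total_samples = 3600 * 16000                         # one hour of audio (assumed rate)
    feature_bytes = total_samples / 1024 * 10 * 80 * 4   # ~180 MB of feature data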
sonusai/mixture/mixdb.py
CHANGED
@@ -248,8 +248,8 @@ class MixtureDatabase:
         return self.fg_info.step

     @cached_property
-    def
-        return self.fg_info.
+    def feature_parameters(self) -> int:
+        return self.fg_info.feature_parameters

     @cached_property
     def ft_config(self) -> TransformConfig:
@@ -809,11 +809,20 @@ class MixtureDatabase:
         :return: Mixture transform data
         """
         from .helpers import forward_transform
+        from .spectral_mask import apply_spectral_mask

         if force or mixture is None:
             mixture = self.mixture_mixture(m_id, targets, target, noise, force)

-
+        mixture_f = forward_transform(mixture, self.ft_config)
+
+        m = self.mixture(m_id)
+        if m.spectral_mask_id is not None:
+            mixture_f = apply_spectral_mask(audio_f=mixture_f,
+                                            spectral_mask=self.spectral_mask(int(m.spectral_mask_id)),
+                                            seed=m.spectral_mask_seed)
+
+        return mixture_f

     def mixture_truth_t(self,
                         m_id: int,
@@ -938,7 +947,6 @@
         import numpy as np
         from pyaaware import FeatureGenerator

-        from .spectral_mask import apply_spectral_mask
         from .truth import truth_reduction

         if not force:
@@ -964,7 +972,7 @@
         if truth_t is None:
             truth_t = np.zeros((m.samples, self.num_classes), dtype=np.float32)

-        feature = np.empty((feature_frames, self.fg_stride, self.
+        feature = np.empty((feature_frames, self.fg_stride, self.feature_parameters), dtype=np.float32)
         truth_f = np.empty((feature_frames, self.num_classes), dtype=np.complex64)

         fg = FeatureGenerator(**asdict(self.fg_config))
@@ -979,11 +987,6 @@
                 truth_f[feature_frame] = fg.truth()
                 feature_frame += 1

-        if m.spectral_mask_id is not None:
-            feature = apply_spectral_mask(feature=feature,
-                                          spectral_mask=self.spectral_mask(int(m.spectral_mask_id)),
-                                          seed=m.spectral_mask_seed)
-
         if np.isreal(truth_f).all():
             return feature, truth_f.real

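Sketch of the renamed property: per-feature width now comes from mixdb.feature_parameters rather than the old per-band size. Constructing the database directly from a location string is an assumption for illustration:

    import numpy as np

    from sonusai.mixture.mixdb import MixtureDatabase

    mixdb = MixtureDatabase('path/to/mixdb')    # assumed constructor usage
    feature_frames = 100
    feature = np.empty((feature_frames, mixdb.fg_stride, mixdb.feature_parameters), dtype=np.float32)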
sonusai/mixture/spectral_mask.py
CHANGED
@@ -1,23 +1,23 @@
-from sonusai.mixture.datatypes import
+from sonusai.mixture.datatypes import AudioF
 from sonusai.mixture.datatypes import SpectralMask


-def apply_spectral_mask(
+def apply_spectral_mask(audio_f: AudioF, spectral_mask: SpectralMask, seed: int = None) -> AudioF:
     """Apply frequency and time masking

     Implementation of SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition

     Ref: https://arxiv.org/pdf/1904.08779.pdf

-    f_width consecutive
-    distribution from 0 to the f_max_width, and f_start is chosen from [0,
+    f_width consecutive bins [f_start, f_start + f_width) are masked, where f_width is chosen from a uniform
+    distribution from 0 to the f_max_width, and f_start is chosen from [0, bins - f_width).

     t_width consecutive frames [t_start, t_start + t_width) are masked, where t_width is chosen from a uniform
     distribution from 0 to the t_max_width, and t_start is chosen from [0, frames - t_width).

     A time mask cannot be wider than t_max_percent times the number of frames.

-    :param
+    :param audio_f: Numpy array of transform audio data [frames, bins]
     :param spectral_mask: Spectral mask parameters
     :param seed: Random number seed
     :return: Augmented feature
@@ -26,28 +26,28 @@ def apply_spectral_mask(feature: Feature, spectral_mask: SpectralMask, seed: int

     from sonusai import SonusAIError

-    if
-        raise SonusAIError('feature input must have three dimensions [frames,
+    if audio_f.ndim != 2:
+        raise SonusAIError('feature input must have three dimensions [frames, bins]')

-    frames,
+    frames, bins = audio_f.shape

     f_max_width = spectral_mask.f_max_width
-    if f_max_width not in range(0,
-        f_max_width =
+    if f_max_width not in range(0, bins + 1):
+        f_max_width = bins

     rng = np.random.default_rng(seed)

     # apply f_num frequency masks to the feature
     for _ in range(spectral_mask.f_num):
         f_width = int(rng.uniform(0, f_max_width))
-        f_start = rng.integers(0,
-
+        f_start = rng.integers(0, bins - f_width, endpoint=True)
+        audio_f[:, f_start:f_start + f_width] = 0

     # apply t_num time masks to the feature
     t_upper_bound = int(spectral_mask.t_max_percent / 100 * frames)
     for _ in range(spectral_mask.t_num):
         t_width = min(int(rng.uniform(0, spectral_mask.t_max_width)), t_upper_bound)
         t_start = rng.integers(0, frames - t_width, endpoint=True)
-
+        audio_f[t_start:t_start + t_width, :] = 0

-    return
+    return audio_f
sonusai/mixture/truth_functions/target.py
CHANGED
@@ -19,7 +19,7 @@ Output shape: [:, num_classes]

    from sonusai import SonusAIError

-    if data.config.num_classes != data.
+    if data.config.num_classes != data.feature_parameters:
        raise SonusAIError(f'Invalid num_classes for target_f truth: {data.config.num_classes}')

    target_freq = _execute_fft(data.target_audio, data.target_fft, len(data.offsets))
@@ -51,7 +51,7 @@ Output shape: [:, 2 * num_classes]
    """
    from sonusai import SonusAIError

-    if data.config.num_classes != 2 * data.
+    if data.config.num_classes != 2 * data.feature_parameters:
        raise SonusAIError(f'Invalid num_classes for target_mixture_f truth: {data.config.num_classes}')

    target_freq = _execute_fft(data.target_audio, data.target_fft, len(data.offsets))
sonusai/mkmanifest.py
CHANGED
@@ -30,6 +30,8 @@ Inputs:
    - 'librispeech'
    - 'vctk_noisy_speech' expects subdirs named like <name>_wav/ and <name>_txt/ with files in
      each using same basename, but with .wav and .txt respectively.
+    - 'mcgill-speech' expects audio data in basename/speakerid/speakerid-promptid.wav and
+      transcript data in Scripts/HarvardLists.dat
    ADAT Audio data environment variable. All found files will be expanded to their full, absolute path and
         then parts of the path that match the specified environment variable value will be replaced with
         the variable. This accommodates portability across platforms where the sound datasets may in
@@ -42,11 +44,11 @@ Outputs the following to the current directory:

 Example usage for LibriSpeech:
     sonusai mkmanifest -mlibrispeech -eADAT -oasr_manifest.json --include='*.flac' train-clean-100
-
+    sonusai mkmanifest -m mcgill-speech -e ADAT -o asr_manifest_16k.json 16k-LP7/
 """
 from sonusai import logger

-VALID_METHOD = ['librispeech', 'vctk_noisy_speech']
+VALID_METHOD = ['librispeech', 'vctk_noisy_speech', 'mcgill-speech']


 def main() -> None:
@@ -88,6 +90,7 @@ def main() -> None:
     from sonusai.utils.asr_manifest_functions import collect_vctk_noisy_speech_transcripts
     from sonusai.utils.asr_manifest_functions import get_librispeech_manifest_entry
     from sonusai.utils.asr_manifest_functions import get_vctk_noisy_speech_manifest_entry
+    from sonusai.utils.asr_manifest_functions import get_mcgill_speech_manifest_entry

     start_time = time.monotonic()

@@ -160,6 +163,30 @@ def main() -> None:
            for result in results:
                f.write(json.dumps(result) + '\n')

+    if method == 'mcgill-speech':
+        logger.info(f'Found {len(entries)} Mcgill Speech files, opening prompt file ...')
+        # Note expecting only one path pointing to data subdir
+        if len(paths) != 1:
+            raise SonusAIError(f'mcgill-speech only support a single path')
+        prompt_fpath = join(join(realpath(abspath(paths[0]))), '../Scripts/HarvardList.dat')
+        with open(prompt_fpath, encoding='utf-8') as f:
+            lines = f.readlines()
+
+        logger.info(f'Found {len(lines) - 4} entries in prompt file.')
+        # First 4 lines are header stuff, can use remaining directly with simple lookup
+        # example line: '01_02:Glue the sheet ...\n' (paragraph 1, sentence 2)
+        # 11 entries per group, so getting line is 11*(p1-1)+(s2-1)
+        lines = lines[4:]
+
+        processing_func = partial(get_mcgill_speech_manifest_entry, transcript_data=lines)
+        progress = tqdm(total=len(entries), desc='Creating Mcgill Speech manifest data')
+        results = pp_tqdm_imap(processing_func, entries, progress=progress)
+        progress.close()
+
+        with open(output, 'w') as f:
+            for result in results:
+                f.write(json.dumps(result) + '\n')
+
     end_time = time.monotonic()
     logger.info('')
     logger.info(f'Completed in {seconds_to_hms(seconds=end_time - start_time)}')
sonusai/onnx_predict.py
CHANGED
@@ -105,7 +105,7 @@ def main() -> None:
        logger.info('')
        logger.info(f'Run prediction on {input_name}')
        audio = read_audio(input_name)
-        feature = get_feature_from_audio(audio=audio,
+        feature = get_feature_from_audio(audio=audio, feature_mode=model_metadata.feature)

        predict = pad_and_predict(feature=feature,
                                  model_name=model_name,
sonusai/plot.py
CHANGED
@@ -314,7 +314,7 @@ def main() -> None:
        raise SonusAIError('Must specify MODEL when input is WAV')

    mixture_audio = read_audio(input_name)
-    feature = get_feature_from_audio(audio=mixture_audio,
+    feature = get_feature_from_audio(audio=mixture_audio, feature_mode=model.feature)
    fg_config = FeatureGeneratorConfig(feature_mode=model.feature,
                                       num_classes=model.output_shape[-1],
                                       truth_mutex=False)
@@ -406,11 +406,11 @@ def main() -> None:
    title = f'{input_name}'
    pdf_name = f'{base_name}-plot.pdf'

-    # Original size [frames, stride,
+    # Original size [frames, stride, feature_parameters]
    # Decimate in the stride dimension
-    # Reshape to get frames*decimated_stride,
+    # Reshape to get frames*decimated_stride, feature_parameters
    if feature.ndim != 3:
-        raise SonusAIError(f'feature does not have 3 dimensions: frames, stride,
+        raise SonusAIError(f'feature does not have 3 dimensions: frames, stride, feature_parameters')
    spectrogram = feature[:, -fg_step:, :]
    spectrogram = np.reshape(spectrogram, (spectrogram.shape[0] * spectrogram.shape[1], spectrogram.shape[2]))

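Small sketch of the decimate-and-reshape step referenced in the comments above; the array sizes are illustrative assumptions:

    import numpy as np

    feature = np.zeros((50, 10, 80), dtype=np.float32)   # [frames, stride, feature_parameters]
    fg_step = 4
    spectrogram = feature[:, -fg_step:, :]               # keep the last fg_step entries of the stride dimension
    spectrogram = np.reshape(spectrogram, (spectrogram.shape[0] * spectrogram.shape[1], spectrogram.shape[2]))
    # -> shape (200, 80): frames * decimated stride rows by feature_parameters columns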
sonusai/post_spenh_targetf.py
CHANGED
@@ -123,7 +123,7 @@ def _process(file: str) -> None:
    from pyaaware import AawareInverseTransform

    from sonusai import SonusAIError
-    from sonusai.mixture import
+    from sonusai.mixture import get_audio_from_transform
    from sonusai.utils import float_to_int16
    from sonusai.utils import unstack_complex
    from sonusai.utils import write_wav
@@ -135,13 +135,13 @@ def _process(file: str) -> None:
        raise SonusAIError(f'Error reading {file}: {e}')

    output_name = join(MP_GLOBAL.output_dir, splitext(basename(file))[0] + '.wav')
-    audio, _ =
-
-
-
-
-
-
+    audio, _ = get_audio_from_transform(data=predict,
+                                        transform=AawareInverseTransform(N=MP_GLOBAL.N,
+                                                                         R=MP_GLOBAL.R,
+                                                                         bin_start=MP_GLOBAL.bin_start,
+                                                                         bin_end=MP_GLOBAL.bin_end,
+                                                                         ttype=MP_GLOBAL.ttype,
+                                                                         gain=np.float32(1)))
    write_wav(name=output_name, audio=float_to_int16(audio))

sonusai/utils/__init__.py
CHANGED
@@ -1,33 +1,35 @@
 # SonusAI general utilities
 from .asl_p56 import asl_p56
+from .asr import ASRData
 from .asr import ASRResult
 from .asr import calc_asr
+from .audio_devices import get_default_input_device
+from .audio_devices import get_input_device_index_by_name
+from .audio_devices import get_input_devices
 from .braced_glob import braced_glob
 from .braced_glob import braced_iglob
 from .calculate_input_shape import calculate_input_shape
 from .convert_string_to_number import convert_string_to_number
+from .create_timestamp import create_timestamp
 from .create_ts_name import create_ts_name
 from .dataclass_from_dict import dataclass_from_dict
 from .db import db_to_linear
 from .db import linear_to_db
+from .docstring import add_commands_to_docstring
+from .docstring import trim_docstring
 from .energy_f import compute_energy_f
 from .engineering_number import EngineeringNumber
 from .get_frames_per_batch import get_frames_per_batch
 from .get_label_names import get_label_names
 from .grouper import grouper
 from .human_readable_size import human_readable_size
-from .keras_utils import check_keras_overrides
-from .keras_utils import create_onnx_from_keras
-from .keras_utils import import_and_check_keras_model
-from .keras_utils import import_keras_model
-from .keras_utils import keras_onnx
 from .max_text_width import max_text_width
+from .model_utils import import_module
 from .numeric_conversion import float_to_int16
 from .numeric_conversion import int16_to_float
 from .onnx_utils import SonusAIMetaData
 from .onnx_utils import add_sonusai_metadata
 from .onnx_utils import get_sonusai_metadata
-from .onnx_utils import replace_stateful_grus
 from .parallel import pp_imap
 from .parallel import pp_tqdm_imap
 from .print_mixture_details import print_class_count
@@ -46,6 +48,5 @@ from .stacked_complex import stacked_complex_imag
 from .stacked_complex import stacked_complex_real
 from .stacked_complex import unstack_complex
 from .stratified_shuffle_split import stratified_shuffle_split_mixid
-from .trim_docstring import trim_docstring
 from .wave import write_wav
 from .yes_or_no import yes_or_no