sonusai 0.15.8__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +35 -4
- sonusai/audiofe.py +237 -0
- sonusai/calc_metric_spenh.py +21 -12
- sonusai/genft.py +2 -1
- sonusai/genmixdb.py +5 -5
- sonusai/lsdb.py +2 -2
- sonusai/main.py +58 -61
- sonusai/mixture/__init__.py +4 -2
- sonusai/mixture/audio.py +0 -34
- sonusai/mixture/config.py +1 -2
- sonusai/mixture/datatypes.py +1 -1
- sonusai/mixture/feature.py +75 -21
- sonusai/mixture/helpers.py +60 -30
- sonusai/mixture/log_duration_and_sizes.py +2 -2
- sonusai/mixture/mixdb.py +13 -10
- sonusai/mixture/spectral_mask.py +14 -14
- sonusai/mixture/truth_functions/data.py +1 -1
- sonusai/mixture/truth_functions/target.py +2 -2
- sonusai/mkmanifest.py +29 -2
- sonusai/onnx_predict.py +1 -1
- sonusai/plot.py +4 -4
- sonusai/post_spenh_targetf.py +8 -8
- sonusai/utils/__init__.py +8 -7
- sonusai/utils/asl_p56.py +3 -3
- sonusai/utils/asr.py +35 -8
- sonusai/utils/asr_functions/__init__.py +0 -5
- sonusai/utils/asr_functions/aaware_whisper.py +2 -2
- sonusai/utils/asr_manifest_functions/__init__.py +1 -0
- sonusai/utils/asr_manifest_functions/mcgill_speech.py +29 -0
- sonusai/utils/audio_devices.py +41 -0
- sonusai/utils/calculate_input_shape.py +3 -4
- sonusai/utils/create_timestamp.py +5 -0
- sonusai/utils/{trim_docstring.py → docstring.py} +20 -0
- sonusai/utils/model_utils.py +30 -0
- sonusai/utils/onnx_utils.py +19 -45
- sonusai/utils/reshape.py +11 -11
- sonusai/utils/wave.py +12 -5
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/METADATA +8 -19
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/RECORD +41 -54
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/WHEEL +1 -1
- sonusai/data_generator/__init__.py +0 -5
- sonusai/data_generator/dataset_from_mixdb.py +0 -143
- sonusai/data_generator/keras_from_mixdb.py +0 -169
- sonusai/data_generator/torch_from_mixdb.py +0 -122
- sonusai/evaluate.py +0 -245
- sonusai/keras_onnx.py +0 -86
- sonusai/keras_predict.py +0 -231
- sonusai/keras_train.py +0 -334
- sonusai/torchl_onnx.py +0 -216
- sonusai/torchl_predict.py +0 -547
- sonusai/torchl_train.py +0 -223
- sonusai/utils/asr_functions/aixplain_whisper.py +0 -59
- sonusai/utils/asr_functions/data.py +0 -16
- sonusai/utils/asr_functions/deepgram.py +0 -97
- sonusai/utils/asr_functions/fastwhisper.py +0 -90
- sonusai/utils/asr_functions/google.py +0 -95
- sonusai/utils/asr_functions/whisper.py +0 -49
- sonusai/utils/keras_utils.py +0 -226
- {sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/entry_points.txt +0 -0
sonusai/utils/asl_p56.py
CHANGED
@@ -22,7 +22,7 @@ def asl_p56(audio: AudioT) -> float:
     # Hangover time in seconds
     H = 0.2
     # Rounded up to next integer
-
+    H_samples = np.ceil(H * SAMPLE_RATE)

     # Margin in dB, difference between threshold and active speech level
     M = 15.9
@@ -40,7 +40,7 @@ def asl_p56(audio: AudioT) -> float:
     a = np.full(thresh_num, -1)

     # Hangover counter for each threshold
-    h = np.full(thresh_num,
+    h = np.full(thresh_num, H_samples)

     # Long-term level square energy of audio
     sq = sum(np.square(audio))
@@ -55,7 +55,7 @@ def asl_p56(audio: AudioT) -> float:
     if q[k] >= c[j]:
         a[j] = a[j] + 1
         h[j] = 0
-    elif h[j] <
+    elif h[j] < H_samples:
         a[j] = a[j] + 1
         h[j] = h[j] + 1
     else:
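Note: a quick sketch of the hangover arithmetic introduced above, assuming SonusAI's usual 16 kHz SAMPLE_RATE (an assumption; the constant is imported elsewhere in this module):

    import numpy as np

    H = 0.2                          # hangover time in seconds, as above
    H_samples = np.ceil(H * 16000)   # assumed 16 kHz sample rate -> 3200.0 samples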
sonusai/utils/asr.py
CHANGED
@@ -1,10 +1,22 @@
 from dataclasses import dataclass
 from typing import Any
+from typing import Callable
 from typing import Optional

 from sonusai.mixture import AudioT


+@dataclass(frozen=True)
+class ASRData:
+    audio: AudioT
+    whisper_model: Optional[Any] = None
+    whisper_model_name: Optional[str] = None
+    device: Optional[str] = None
+    cpu_threads: Optional[int] = None
+    compute_type: Optional[str] = None
+    beam_size: Optional[int] = None
+
+
 @dataclass(frozen=True)
 class ASRResult:
     text: str
@@ -16,8 +28,24 @@ class ASRResult:
     asr_cpu_time: Optional[float] = None


+def get_available_engines() -> dict[str, Callable[[ASRData], ASRResult]]:
+    from importlib import import_module
+    from pkgutil import iter_modules
+
+    module = import_module('sonusai.utils.asr_functions')
+    engines = {method: getattr(module, method) for method in dir(module) if not method.startswith('_')}
+    for _, name, _ in iter_modules():
+        if name.startswith('sonusai_asr_'):
+            module = import_module(f'{name}.asr_functions')
+            for method in dir(module):
+                if not method.startswith('_'):
+                    engines[method] = getattr(module, method)
+
+    return engines
+
+
 def calc_asr(audio: AudioT | str,
-             engine: Optional[str] = '
+             engine: Optional[str] = 'aaware_whisper',
              whisper_model: Optional[Any] = None,
              whisper_model_name: Optional[str] = 'tiny',
              device: Optional[str] = 'cpu',
@@ -43,15 +71,14 @@ def calc_asr(audio: AudioT | str,

     from sonusai import SonusAIError
     from sonusai.mixture import read_audio
-
-
+
+    available_engines = get_available_engines()
+    if engine not in available_engines:
+        raise SonusAIError(f'Unsupported ASR function: {engine}')

     if not isinstance(audio, np.ndarray):
         audio = copy(read_audio(audio))

-    data =
+    data = ASRData(audio, whisper_model, whisper_model_name, device, cpu_threads, compute_type, beam_size)

-
-        return getattr(asr_functions, engine)(data)
-    except AttributeError:
-        raise SonusAIError(f'Unsupported ASR function: {engine}')
+    return available_engines[engine](data)
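A minimal usage sketch of the new registry-based engine dispatch, assuming the package is installed and the default Aaware Whisper endpoint is reachable; the audio path is illustrative:

    from sonusai.utils.asr import calc_asr
    from sonusai.utils.asr import get_available_engines

    print(sorted(get_available_engines()))  # built-in engines plus any installed sonusai_asr_* plugins
    result = calc_asr('speech.wav', engine='aaware_whisper', whisper_model_name='tiny')
    print(result.text)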
sonusai/utils/asr_functions/aaware_whisper.py
CHANGED
@@ -1,8 +1,8 @@
+from sonusai.utils import ASRData
 from sonusai.utils import ASRResult
-from sonusai.utils.asr_functions.data import Data


-def aaware_whisper(data:
+def aaware_whisper(data: ASRData) -> ASRResult:
     import tempfile
     from math import exp
     from os import getenv
sonusai/utils/asr_manifest_functions/__init__.py
CHANGED
@@ -4,3 +4,4 @@ from .librispeech import collect_librispeech_transcripts
 from .librispeech import get_librispeech_manifest_entry
 from .vctk_noisy_speech import collect_vctk_noisy_speech_transcripts
 from .vctk_noisy_speech import get_vctk_noisy_speech_manifest_entry
+from .mcgill_speech import get_mcgill_speech_manifest_entry
sonusai/utils/asr_manifest_functions/mcgill_speech.py
ADDED
@@ -0,0 +1,29 @@
+from sonusai.utils.asr_manifest_functions import PathInfo
+
+
+def get_mcgill_speech_manifest_entry(entry: PathInfo, transcript_data: list[str]) -> dict:
+    from os.path import splitext
+    from os.path import basename
+    from subprocess import check_output
+
+    from sonusai import SonusAIError
+
+    name = splitext(entry.abs_path)[0]
+    duration = float(check_output(f'soxi -D {entry.abs_path}', shell=True))
+    # i.e., from MA01_02.wav, get 01_02
+    promptname = basename(name)[2:]
+    # paragraph num
+    pnum = int(promptname[0:2])
+    snum = int(promptname[3:5])
+    idx = 11 * (pnum - 1) + (snum - 1)
+    try:
+        # remove prompt-id prefix and \n suffix
+        text = transcript_data[idx][6:-1]
+    except IndexError:
+        raise SonusAIError(f'Could not find {promptname}, idx {idx} in transcript data')
+
+    return {
+        'audio_filepath': entry.audio_filepath,
+        'text': text,
+        'duration': duration,
+    }
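A worked example of the prompt-to-transcript indexing above; the multiplier implies 11 sentences per paragraph in the transcript list:

    promptname = '01_02'                # derived from MA01_02.wav, as in the comment above
    pnum = int(promptname[0:2])         # paragraph 1
    snum = int(promptname[3:5])         # sentence 2
    idx = 11 * (pnum - 1) + (snum - 1)  # -> 1, the second entry in transcript_data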
sonusai/utils/audio_devices.py
ADDED
@@ -0,0 +1,41 @@
+import pyaudio
+
+
+def get_input_device_index_by_name(p: pyaudio.PyAudio, name: str = None) -> int:
+    info = p.get_host_api_info_by_index(0)
+    device_count = info.get('deviceCount')
+    for i in range(0, device_count):
+        device_info = p.get_device_info_by_host_api_device_index(0, i)
+        if name is None:
+            device_name = None
+        else:
+            device_name = device_info.get('name')
+        if name == device_name and device_info.get('maxInputChannels') > 0:
+            return i
+
+    raise ValueError(f'Could not find {name}')
+
+
+def get_input_devices(p: pyaudio.PyAudio) -> list[str]:
+    names = []
+    info = p.get_host_api_info_by_index(0)
+    device_count = info.get('deviceCount')
+    for i in range(0, device_count):
+        device_info = p.get_device_info_by_host_api_device_index(0, i)
+        device_name = device_info.get('name')
+        if device_info.get('maxInputChannels') > 0:
+            names.append(device_name)
+
+    return names
+
+
+def get_default_input_device(p: pyaudio.PyAudio) -> str:
+    info = p.get_host_api_info_by_index(0)
+    device_count = info.get('deviceCount')
+    for i in range(0, device_count):
+        device_info = p.get_device_info_by_host_api_device_index(0, i)
+        device_name = device_info.get('name')
+        if device_info.get('maxInputChannels') > 0:
+            return device_name
+
+    raise ValueError('No input audio devices found')
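A minimal sketch of the new device helpers, assuming a working PortAudio/pyaudio installation with at least one capture device:

    import pyaudio

    from sonusai.utils.audio_devices import get_default_input_device
    from sonusai.utils.audio_devices import get_input_devices

    p = pyaudio.PyAudio()
    try:
        print('Capture devices:', get_input_devices(p))
        print('Default capture device:', get_default_input_device(p))
    finally:
        p.terminate()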
sonusai/utils/calculate_input_shape.py
CHANGED
@@ -13,13 +13,12 @@ def calculate_input_shape(feature: str,
     """
     from pyaaware import FeatureGenerator

-
-    fg = FeatureGenerator(feature_mode=feature, num_classes=2)
+    fg = FeatureGenerator(feature_mode=feature)

     if flatten:
-        in_shape = [fg.stride * fg.
+        in_shape = [fg.stride * fg.feature_parameters]
     else:
-        in_shape = [fg.stride, fg.
+        in_shape = [fg.stride, fg.feature_parameters]

     if timesteps > 0:
         in_shape.insert(0, timesteps)
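A usage sketch of the updated helper; the feature-mode string is a placeholder, and the call assumes flatten and timesteps are the only other arguments that need to be supplied:

    from sonusai.utils.calculate_input_shape import calculate_input_shape

    # [stride, feature_parameters]; timesteps is prepended when timesteps > 0
    print(calculate_input_shape('example_feature_mode', flatten=False, timesteps=0))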
sonusai/utils/{trim_docstring.py → docstring.py}
CHANGED
@@ -28,3 +28,23 @@ def trim_docstring(docstring: str) -> str:

     # Return a single string
     return '\n'.join(trimmed)
+
+
+def add_commands_to_docstring(docstring: str, plugin_docstrings: list[str]) -> str:
+    """Add commands to docstring"""
+    import sonusai
+
+    lines = docstring.splitlines()
+
+    start = lines.index('The sonusai commands are:')
+    end = lines.index('', start)
+
+    commands = sonusai.commands_doc.splitlines()
+    for plugin_docstring in plugin_docstrings:
+        commands.extend(plugin_docstring.splitlines())
+    commands.sort()
+    commands = list(filter(None, commands))
+
+    lines = lines[:start + 1] + commands + lines[end:]
+
+    return '\n'.join(lines)
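A short sketch of how the new helper splices command lists into a top-level usage docstring; the docstring text and plugin entry are made up for illustration:

    from sonusai.utils.docstring import add_commands_to_docstring

    doc = '\n'.join([
        'usage: sonusai [--version] [--help] <command> [<args>...]',
        '',
        'The sonusai commands are:',
        '',
        "See 'sonusai help <command>' for more information on a specific command.",
    ])

    # hypothetical plugin docstring advertising one extra command
    plugins = ['   mytool   Example plugin command']
    print(add_commands_to_docstring(doc, plugins))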
sonusai/utils/model_utils.py
ADDED
@@ -0,0 +1,30 @@
+from typing import Any
+
+
+def import_module(name: str) -> Any:
+    """Import a Python module adding the module file's directory to the Python system path so that relative package
+    imports are found correctly.
+    """
+    import os
+    import sys
+    from importlib import import_module
+
+    from sonusai import SonusAIError
+
+    try:
+        path = os.path.dirname(name)
+        if len(path) < 1:
+            path = './'
+
+        # Add model file location to system path
+        sys.path.append(os.path.abspath(path))
+
+        try:
+            root = os.path.splitext(os.path.basename(name))[0]
+            model = import_module(root)
+        except Exception as e:
+            raise SonusAIError(f'Error: could not import model from {name}: {e}.')
+    except Exception as e:
+        raise SonusAIError(f'Error: could not find {name}: {e}.')
+
+    return model
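A usage sketch of the new loader; the script path and attribute name are hypothetical:

    from sonusai.utils.model_utils import import_module

    module = import_module('models/my_model.py')      # hypothetical user model script
    MyModel = getattr(module, 'MyHyperModel', None)   # hypothetical class defined in that script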
sonusai/utils/onnx_utils.py
CHANGED
@@ -3,33 +3,15 @@ from dataclasses import dataclass
 from onnxruntime import InferenceSession


-
-
-
-
-
-
-
-
-
-    for i in range(len(keras_model.layers)):
-        layer = keras_model.layers[i]
-        if isinstance(layer, GRU):
-            if layer.stateful:
-                stateful_gru_names.append(layer.name)
-
-    for node_index in range(len(onnx_model.graph.node)):
-        node = onnx_model.graph.node[node_index]
-        replace = False
-        if node.op_type == 'GRU':
-            for i in node.input:
-                for n in stateful_gru_names:
-                    if n in i:
-                        replace = True
-        if node.name in stateful_gru_names or replace:
-            node.op_type = 'SGRU'
-
-    return onnx_model
+@dataclass(frozen=True)
+class SonusAIMetaData:
+    input_shape: list[int]
+    output_shape: list[int]
+    flattened: bool
+    timestep: bool
+    channel: bool
+    mutex: bool
+    feature: str


 def add_sonusai_metadata(model,
@@ -38,13 +20,14 @@ def add_sonusai_metadata(model,
                          has_channel: bool = False,
                          is_mutex: bool = True,
                          feature: str = ''):
-    """Add SonusAI metadata to ONNX model.
-
-
-
-
-
-
+    """Add SonusAI metadata to an ONNX model.
+
+    :param model: ONNX model
+    :param is_flattened: Model feature data is flattened
+    :param has_timestep: Model has timestep dimension
+    :param has_channel: Model has channel dimension
+    :param is_mutex: Model label output is mutually exclusive
+    :param feature: Model feature type
     """
     is_flattened_flag = model.metadata_props.add()
     is_flattened_flag.key = 'is_flattened'
@@ -69,18 +52,9 @@
     return model


-@dataclass(frozen=True)
-class SonusAIMetaData:
-    input_shape: list[int]
-    output_shape: list[int]
-    flattened: bool
-    timestep: bool
-    channel: bool
-    mutex: bool
-    feature: str
-
-
 def get_sonusai_metadata(model: InferenceSession) -> SonusAIMetaData:
+    """Get SonusAI metadata from an ONNX model.
+    """
     m = model.get_modelmeta().custom_metadata_map
     return SonusAIMetaData(input_shape=model.get_inputs()[0].shape,
                            output_shape=model.get_outputs()[0].shape,
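A sketch of reading the metadata back from a tagged model, assuming 'model.onnx' was previously written with add_sonusai_metadata applied:

    from onnxruntime import InferenceSession

    from sonusai.utils.onnx_utils import get_sonusai_metadata

    session = InferenceSession('model.onnx')
    meta = get_sonusai_metadata(session)
    print(meta.feature, meta.input_shape, meta.flattened, meta.timestep)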
sonusai/utils/reshape.py
CHANGED
@@ -17,14 +17,14 @@ def reshape_inputs(feature: Feature,
                    timesteps: int = 0,
                    flatten: bool = False,
                    add1ch: bool = False) -> tuple[Feature, Optional[Truth]]:
-    """Check SonusAI feature and truth data and reshape feature of size [frames, strides,
+    """Check SonusAI feature and truth data and reshape feature of size [frames, strides, feature_parameters] into
     one of several options:

     If timesteps > 0: (i.e., for recurrent NNs):
-      no-flatten, no-channel: [sequences, timesteps, strides,
-      flatten, no-channel: [sequences, timesteps, strides*
-      no-flatten, add-1channel: [sequences, timesteps, strides,
-      flatten, add-1channel: [sequences, timesteps, strides*
+      no-flatten, no-channel: [sequences, timesteps, strides, feature_parameters] (4-dim)
+      flatten, no-channel: [sequences, timesteps, strides*feature_parameters] (3-dim)
+      no-flatten, add-1channel: [sequences, timesteps, strides, feature_parameters, 1] (5-dim)
+      flatten, add-1channel: [sequences, timesteps, strides*feature_parameters, 1] (4-dim)

     If batch_size is None, then do not reshape; just calculate new input shape and return.

@@ -40,7 +40,7 @@ def reshape_inputs(feature: Feature,
     """
     from sonusai import SonusAIError

-    frames, strides,
+    frames, strides, feature_parameters = feature.shape
     if truth is not None:
         truth_frames, num_classes = truth.shape
         # Double-check correctness of inputs
@@ -50,7 +50,7 @@ def reshape_inputs(feature: Feature,
         num_classes = None

     if flatten:
-        feature = np.reshape(feature, (frames, strides *
+        feature = np.reshape(feature, (frames, strides * feature_parameters))

     # Reshape for Keras/TF recurrent models that require timesteps/sequence length dimension
     if timesteps > 0:
@@ -73,14 +73,14 @@ def reshape_inputs(feature: Feature,

         # Reshape
         if feature.ndim == 2:  # flattened input
-            # was [frames,
-            feature = np.reshape(feature, (sequences, timesteps, strides *
+            # was [frames, feature_parameters*timesteps]
+            feature = np.reshape(feature, (sequences, timesteps, strides * feature_parameters))
             if truth is not None:
                 # was [frames, num_classes]
                 truth = np.reshape(truth, (sequences, timesteps, num_classes))
         elif feature.ndim == 3:  # un-flattened input
-            # was [frames,
-            feature = np.reshape(feature, (sequences, timesteps, strides,
+            # was [frames, feature_parameters, timesteps]
+            feature = np.reshape(feature, (sequences, timesteps, strides, feature_parameters))
             if truth is not None:
                 # was [frames, num_classes]
                 truth = np.reshape(truth, (sequences, timesteps, num_classes))
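A usage sketch of the renamed feature_parameters dimension; shapes are illustrative and the keyword names follow the signature and docstring shown in this hunk:

    import numpy as np

    from sonusai.utils.reshape import reshape_inputs

    feature = np.random.rand(8, 4, 80).astype(np.float32)  # [frames, strides, feature_parameters]
    f, t = reshape_inputs(feature=feature, truth=None, batch_size=1, timesteps=4, flatten=True)
    print(f.shape)  # [sequences, timesteps, strides * feature_parameters]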
sonusai/utils/wave.py
CHANGED
@@ -5,15 +5,22 @@ from sonusai.mixture.datatypes import AudioT
 def write_wav(name: str, audio: AudioT, sample_rate: int = SAMPLE_RATE) -> None:
     """ Write a simple, uncompressed WAV file.

-    To write multiple channels, use a 2D array of shape [
+    To write multiple channels, use a 2D array of shape [channels, samples].
     The bits per sample and PCM/float are determined by the data type.

     """
-    import numpy as np
     import torch
     import torchaudio

-
-    audio = np.reshape(audio, (1, audio.shape[0]))
+    data = torch.tensor(audio)

-
+    if data.dim() == 1:
+        data = torch.reshape(data, (1, data.shape[0]))
+    if data.dim() != 2:
+        raise ValueError(f'audio must be a 1D or 2D array')
+
+    # Assuming data has more samples than channels, check if array needs to be transposed
+    if data.shape[1] < data.shape[0]:
+        data = torch.transpose(data, 0, 1)
+
+    torchaudio.save(uri=name, src=data, sample_rate=sample_rate)
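A sketch of the torchaudio-backed writer; the output paths are illustrative, and per the docstring the float32 dtype selects a float WAV:

    import numpy as np

    from sonusai.utils.wave import write_wav

    write_wav('mono.wav', np.zeros(16000, dtype=np.float32))         # 1-D mono data
    write_wav('stereo.wav', np.zeros((2, 16000), dtype=np.float32))  # [channels, samples]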
{sonusai-0.15.8.dist-info → sonusai-0.16.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonusai
-Version: 0.15.8
+Version: 0.16.0
 Summary: Framework for building deep neural network models for sound, speech, and voice AI
 Home-page: https://aaware.com
 License: GPL-3.0-only
@@ -15,50 +15,39 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
-Requires-Dist: aixplain (>=0.2.6,<0.3.0)
-Requires-Dist: ctranslate2 (==4.1.0)
 Requires-Dist: dataclasses-json (>=0.6.1,<0.7.0)
-Requires-Dist: deepgram-sdk (>=3.0.0,<4.0.0)
 Requires-Dist: docopt (>=0.6.2,<0.7.0)
-Requires-Dist: faster-whisper (>=1.0.1,<2.0.0)
 Requires-Dist: h5py (>=3.11.0,<4.0.0)
 Requires-Dist: jiwer (>=3.0.3,<4.0.0)
-Requires-Dist: keras (>=3.1.1,<4.0.0)
-Requires-Dist: keras-tuner (>=1.4.7,<2.0.0)
 Requires-Dist: librosa (>=0.10.1,<0.11.0)
-Requires-Dist: lightning (>=2.2,<2.3)
 Requires-Dist: matplotlib (>=3.8.0,<4.0.0)
 Requires-Dist: onnx (>=1.14.1,<2.0.0)
 Requires-Dist: onnxruntime (>=1.16.1,<2.0.0)
 Requires-Dist: paho-mqtt (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.1.1,<3.0.0)
 Requires-Dist: pesq (>=0.0.4,<0.0.5)
-Requires-Dist: pyaaware (>=1.5.
+Requires-Dist: pyaaware (>=1.5.7,<2.0.0)
+Requires-Dist: pyaudio (>=0.2.14,<0.3.0)
 Requires-Dist: pydub (>=0.25.1,<0.26.0)
 Requires-Dist: pystoi (>=0.4.0,<0.5.0)
-Requires-Dist: python-magic (>=0.4.27,<0.5.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: samplerate (>=0.2.1,<0.3.0)
 Requires-Dist: soundfile (>=0.12.1,<0.13.0)
 Requires-Dist: sox (>=1.4.1,<2.0.0)
-Requires-Dist: speechrecognition (>=3.10.1,<4.0.0)
-Requires-Dist: tensorflow (>=2.15.0,<3.0.0)
-Requires-Dist: tf2onnx (>=1.15.1,<2.0.0)
 Requires-Dist: torch (>=2.2,<2.3)
 Requires-Dist: torchaudio (>=2.2,<2.3)
-Requires-Dist: torchinfo (>=1.8.0,<2.0.0)
 Requires-Dist: tqdm (>=4.66.1,<5.0.0)
 Description-Content-Type: text/x-rst

-
+SonusAI: Framework for simplified creation of deep NN models for sound, speech, and voice AI

-
+SonusAI includes functions for pre-processing training and validation data and
 creating performance metrics reports for key types of Keras models:
 - recurrent, convolutional, or a combination (i.e. RCNNs)
 - binary, multiclass single-label, multiclass multi-label, and regression
 - training with data augmentations: noise mixing, pitch and time stretch, etc.

-
-- Aaware Inc. sonusai
-- Keras model scripts: User python scripts for
+SonusAI python functions are used by:
+- Aaware Inc. sonusai framework: Easily create train/validation data, run prediction, evaluate model performance
+- Keras model scripts: User python scripts for Keras model creation, training, and prediction. These can use sonusai-specific data but also some general useful utilities for training rnn-based models like CRNN's, DSCRNN's, etc. in Keras.
