sonusai 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/audiofe.py +22 -51
- sonusai/calc_metric_spenh.py +206 -213
- sonusai/doc/doc.py +1 -1
- sonusai/mixture/__init__.py +2 -0
- sonusai/mixture/audio.py +12 -0
- sonusai/mixture/datatypes.py +11 -3
- sonusai/mixture/mixdb.py +101 -0
- sonusai/mixture/soundfile_audio.py +39 -0
- sonusai/mixture/speaker_metadata.py +35 -0
- sonusai/mixture/torchaudio_audio.py +22 -0
- sonusai/mkmanifest.py +1 -1
- sonusai/onnx_predict.py +114 -410
- sonusai/queries/queries.py +1 -1
- sonusai/speech/__init__.py +3 -0
- sonusai/speech/l2arctic.py +116 -0
- sonusai/speech/librispeech.py +99 -0
- sonusai/speech/mcgill.py +70 -0
- sonusai/speech/textgrid.py +100 -0
- sonusai/speech/timit.py +135 -0
- sonusai/speech/types.py +12 -0
- sonusai/speech/vctk.py +52 -0
- sonusai/speech/voxceleb2.py +86 -0
- sonusai/utils/__init__.py +2 -1
- sonusai/utils/asr_manifest_functions/__init__.py +0 -1
- sonusai/utils/asr_manifest_functions/data.py +0 -8
- sonusai/utils/asr_manifest_functions/librispeech.py +1 -1
- sonusai/utils/asr_manifest_functions/mcgill_speech.py +1 -1
- sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +1 -1
- sonusai/utils/braced_glob.py +7 -3
- sonusai/utils/onnx_utils.py +110 -106
- sonusai/utils/path_info.py +7 -0
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/METADATA +2 -1
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/RECORD +35 -30
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/WHEEL +1 -1
- sonusai/calc_metric_spenh-save.py +0 -1334
- sonusai/onnx_predict-old.py +0 -240
- sonusai/onnx_predict-save.py +0 -487
- sonusai/ovino_predict.py +0 -508
- sonusai/ovino_query_devices.py +0 -47
- sonusai/torchl_onnx-old.py +0 -216
- {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/entry_points.txt +0 -0
sonusai/doc/doc.py
CHANGED
@@ -40,7 +40,7 @@ Required field:
|
|
40
40
|
'name'
|
41
41
|
File name. May be one of the following:
|
42
42
|
|
43
|
-
audio Supported formats are .wav, .mp3, .aif, .flac, and .ogg
|
43
|
+
audio Supported formats are .wav, .mp3, .m4a, .aif, .flac, and .ogg
|
44
44
|
glob Matches file glob patterns
|
45
45
|
.yml The given YAML file is parsed into the list
|
46
46
|
.txt Each line in the given text file indicates an item which
|
sonusai/mixture/__init__.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
from .audio import get_duration
|
3
3
|
from .audio import get_next_noise
|
4
4
|
from .audio import get_num_samples
|
5
|
+
from .audio import get_sample_rate
|
5
6
|
from .audio import read_audio
|
6
7
|
from .audio import read_ir
|
7
8
|
from .audio import validate_input_file
|
@@ -73,6 +74,7 @@ from .datatypes import Predict
|
|
73
74
|
from .datatypes import Segsnr
|
74
75
|
from .datatypes import SpectralMask
|
75
76
|
from .datatypes import SpectralMasks
|
77
|
+
from .datatypes import SpeechMetadata
|
76
78
|
from .datatypes import TargetFile
|
77
79
|
from .datatypes import TargetFiles
|
78
80
|
from .datatypes import TransformConfig
|
sonusai/mixture/audio.py
CHANGED
@@ -45,6 +45,18 @@ def validate_input_file(input_filepath: str) -> None:
|
|
45
45
|
raise SonusAIError(f'This installation cannot process .{ext} files')
|
46
46
|
|
47
47
|
|
48
|
+
@lru_cache
def get_sample_rate(name: str) -> int:
    """Return the sample rate of an audio file

    Thin front end for the soundfile-based reader, which is imported at call time.
    Results are memoized per file name by lru_cache.

    :param name: File name
    :return: Sample rate
    """
    from .soundfile_audio import get_sample_rate as _soundfile_get_sample_rate

    return _soundfile_get_sample_rate(name)
|
58
|
+
|
59
|
+
|
48
60
|
@lru_cache
|
49
61
|
def read_audio(name: str) -> AudioT:
|
50
62
|
"""Read audio data from a file
|
sonusai/mixture/datatypes.py
CHANGED
@@ -6,6 +6,7 @@ from typing import TypeAlias
|
|
6
6
|
import numpy as np
|
7
7
|
import numpy.typing as npt
|
8
8
|
from dataclasses_json import DataClassJsonMixin
|
9
|
+
from praatio.utilities.constants import Interval
|
9
10
|
|
10
11
|
AudioT: TypeAlias = npt.NDArray[np.float32]
|
11
12
|
AudiosT: TypeAlias = list[AudioT]
|
@@ -249,7 +250,7 @@ class Target(DataClassSonusAIMixin):
|
|
249
250
|
gain: Optional[float] = None
|
250
251
|
|
251
252
|
|
252
|
-
Targets = list[Target]
|
253
|
+
Targets: TypeAlias = list[Target]
|
253
254
|
|
254
255
|
|
255
256
|
@dataclass
|
@@ -276,11 +277,15 @@ class Mixture(DataClassSonusAIMixin):
|
|
276
277
|
return self.noise.file_id
|
277
278
|
|
278
279
|
@property
|
279
|
-
def
|
280
|
+
def target_ids(self) -> list[int]:
|
280
281
|
return [target.file_id for target in self.targets]
|
281
282
|
|
283
|
+
@property
|
284
|
+
def target_augmentations(self) -> list[Augmentation]:
|
285
|
+
return [target.augmentation for target in self.targets]
|
286
|
+
|
282
287
|
|
283
|
-
Mixtures = list[Mixture]
|
288
|
+
Mixtures: TypeAlias = list[Mixture]
|
284
289
|
|
285
290
|
|
286
291
|
@dataclass(frozen=True)
|
@@ -326,3 +331,6 @@ class MixtureDatabaseConfig(DataClassSonusAIMixin):
|
|
326
331
|
target_files: Optional[TargetFiles] = None
|
327
332
|
truth_mutex: Optional[bool] = None
|
328
333
|
truth_reduction_function: Optional[str] = None
|
334
|
+
|
335
|
+
|
336
|
+
SpeechMetadata: TypeAlias = str | list[Interval] | None
|
sonusai/mixture/mixdb.py
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
from functools import cached_property
|
2
2
|
from functools import lru_cache
|
3
3
|
from functools import partial
|
4
|
+
from pathlib import Path
|
4
5
|
from sqlite3 import Connection
|
5
6
|
from sqlite3 import Cursor
|
6
7
|
from typing import Any
|
8
|
+
from typing import Callable
|
7
9
|
from typing import Optional
|
8
10
|
|
11
|
+
from praatio import textgrid
|
12
|
+
from praatio.utilities.constants import Interval
|
13
|
+
|
9
14
|
from sonusai.mixture.datatypes import AudioF
|
10
15
|
from sonusai.mixture.datatypes import AudioT
|
11
16
|
from sonusai.mixture.datatypes import AudiosF
|
@@ -23,11 +28,13 @@ from sonusai.mixture.datatypes import NoiseFiles
|
|
23
28
|
from sonusai.mixture.datatypes import Segsnr
|
24
29
|
from sonusai.mixture.datatypes import SpectralMask
|
25
30
|
from sonusai.mixture.datatypes import SpectralMasks
|
31
|
+
from sonusai.mixture.datatypes import SpeechMetadata
|
26
32
|
from sonusai.mixture.datatypes import TargetFile
|
27
33
|
from sonusai.mixture.datatypes import TargetFiles
|
28
34
|
from sonusai.mixture.datatypes import TransformConfig
|
29
35
|
from sonusai.mixture.datatypes import Truth
|
30
36
|
from sonusai.mixture.datatypes import UniversalSNR
|
37
|
+
from sonusai.mixture.tokenized_shell_vars import tokenized_expand
|
31
38
|
|
32
39
|
|
33
40
|
def db_file(location: str, test: bool = False) -> str:
|
@@ -81,6 +88,7 @@ class MixtureDatabase:
|
|
81
88
|
def __init__(self, location: str, test: bool = False) -> None:
|
82
89
|
self.location = location
|
83
90
|
self.db = partial(SQLiteContextManager, self.location, test)
|
91
|
+
self._speaker_metadata_tiers: list[str] = []
|
84
92
|
|
85
93
|
@cached_property
|
86
94
|
def json(self) -> str:
|
@@ -1069,6 +1077,99 @@ class MixtureDatabase:
|
|
1069
1077
|
|
1070
1078
|
return class_count
|
1071
1079
|
|
1080
|
+
@cached_property
def _speech_metadata(self) -> dict[str, dict[str, SpeechMetadata]]:
    """Speech metadata is a nested dictionary.

    data['target_file_name'] = { 'tier': SpeechMetadata, ... }

    Every target file gets an entry. The tiers are read from a '.TextGrid' file that has the same
    (expanded) path as the target file; a target file without one gets an empty dictionary.

    A tier holding exactly one entry is stored as that entry's label (str). A tier holding several
    entries is stored as the list of entries. A tier holding no entries is omitted, so looking it up
    with .get() yields None.
    """
    data: dict[str, dict[str, SpeechMetadata]] = {}
    for file in self.target_files:
        tiers: dict[str, SpeechMetadata] = {}
        data[file.name] = tiers

        file_name, _ = tokenized_expand(file.name)
        tg_file = Path(file_name).with_suffix('.TextGrid')
        if not tg_file.exists():
            continue

        tg = textgrid.openTextgrid(str(tg_file), includeEmptyIntervals=False)
        for tier in tg.tierNames:
            entries = tg.getTier(tier).entries
            if not entries:
                # Empty intervals are excluded when the file is opened, so a tier can end up with
                # nothing in it; indexing entries[0] here would raise IndexError.
                continue

            if len(entries) > 1:
                # Stored as a list to match the SpeechMetadata alias (str | list[Interval] | None).
                # NOTE(review): praatio may hand back entries as a tuple - confirm.
                tiers[tier] = list(entries)
            else:
                tiers[tier] = entries[0].label

    return data
|
1101
|
+
|
1102
|
+
@cached_property
def speech_metadata_tiers(self) -> list[str]:
    """Sorted, unique tier names found in the speech metadata of all target files."""
    return sorted({tier for tiers in self._speech_metadata.values() for tier in tiers})
|
1105
|
+
|
1106
|
+
def speech_metadata_all(self, tier: str) -> list[SpeechMetadata]:
    """Get the sorted, unique string values of a speech metadata tier across all target files.

    Only string-valued tiers contribute; target files where the tier is missing or holds a list of
    entries are ignored.

    :param tier: Tier name
    :return: Sorted list of unique string values
    """
    values = (metadata.get(tier) for metadata in self._speech_metadata.values())
    return sorted({value for value in values if isinstance(value, str)})
|
1110
|
+
|
1111
|
+
def mixids_for_speech_metadata(self,
                               tier: str,
                               value: str,
                               predicate: Optional[Callable[[str], bool]] = None) -> list[int]:
    """Get a list of mixids for the given speech metadata tier.

    If 'predicate' is None, then include mixids whose tier values are equal to the given 'value'. If 'predicate' is
    not None, then ignore 'value' and use the given callable to determine which entries to include.

    Only string-valued tiers are considered; target files where the tier is missing or holds a list of entries
    never match.

    Examples:

    >>> mixids = mixdb.mixids_for_speech_metadata('speaker_id', 'TIMIT_ARC0')
    Get mixids for mixtures with speakers whose speaker_ids are 'TIMIT_ARC0'.

    >>> mixids = mixdb.mixids_for_speech_metadata('age', '', lambda x: int(x) < 25)
    Get mixids for mixtures with speakers whose ages are less than 25.

    >>> mixids = mixdb.mixids_for_speech_metadata('dialect', '', lambda x: x in ['New York City', 'Northern'])
    Get mixids for mixtures with speakers whose dialects are either 'New York City' or 'Northern'.

    :param tier: Tier name
    :param value: Value to match when no predicate is given
    :param predicate: Optional callable that decides whether a tier value matches
    :return: Sorted list of unique mixids that contain at least one matching target file
    """
    if predicate is None:
        def predicate(x: str) -> bool:
            return x == value

    # First collect the matching target file names; a set keeps the per-target lookup below O(1)
    matching_files: set[str] = set()
    for name, metadata in self._speech_metadata.items():
        tier_value = metadata.get(tier)
        if isinstance(tier_value, str) and predicate(tier_value):
            matching_files.add(name)

    # Next keep each mixid that contains at least one of those target files
    mixids = [mixid for mixid in self.mixids_to_list()
              if any(self.target_file(target.file_id).name in matching_files
                     for target in self.mixture(mixid).targets)]

    # Return sorted, unique list of mixids
    return sorted(set(mixids))
|
1149
|
+
|
1150
|
+
def get_speech_metadata(self, mixid: int, tier: str) -> list[SpeechMetadata]:
    """Get the speech metadata of one tier for every target in a mixture.

    The result has exactly one item per target, in target order:

    - None when the target's file has no such tier
    - the tier's string value when the tier holds a single value
    - a list of Interval entries otherwise; when the target has a tempo augmentation, each entry's
      start and end are divided by the tempo so they line up with the augmented audio

    :param mixid: Mixture ID
    :param tier: Tier name
    :return: List of speech metadata, one item per target
    """
    results: list[SpeechMetadata] = []
    for target in self.mixture(mixid).targets:
        data = self._speech_metadata[self.target_file(target.file_id).name].get(tier)

        # NOTE(review): entries may arrive as a tuple depending on how praatio returns them - confirm;
        # both forms are treated as an interval list here.
        if isinstance(data, (list, tuple)):
            tempo = target.augmentation.tempo
            if tempo is None:
                # Copy so callers cannot modify the cached metadata
                entries = list(data)
            else:
                # Check for tempo augmentation and adjust Interval start and end data as needed
                entries = [Interval(entry.start / tempo, entry.end / tempo, entry.label) for entry in data]
            # Without this append the target would be dropped from the result
            results.append(entries)
        else:
            # None (tier missing) and str (single value) are passed through unchanged
            results.append(data)

    return results
|
1172
|
+
|
1072
1173
|
|
1073
1174
|
@lru_cache
|
1074
1175
|
def _spectral_mask(db: partial, sm_id: int) -> SpectralMask:
|
sonusai/mixture/soundfile_audio.py
CHANGED
@@ -18,6 +18,11 @@ def _raw_read(name: str) -> tuple[AudioT, int]:
|
|
18
18
|
raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
|
19
19
|
raw = raw / 2 ** (sound.sample_width * 8 - 1)
|
20
20
|
sample_rate = sound.frame_rate
|
21
|
+
elif expanded_name.endswith('.m4a'):
|
22
|
+
sound = AudioSegment.from_file(expanded_name)
|
23
|
+
raw = np.array(sound.get_array_of_samples()).astype(np.float32).reshape((-1, sound.channels))
|
24
|
+
raw = raw / 2 ** (sound.sample_width * 8 - 1)
|
25
|
+
sample_rate = sound.frame_rate
|
21
26
|
else:
|
22
27
|
raw, sample_rate = soundfile.read(expanded_name, always_2d=True, dtype='float32')
|
23
28
|
except Exception as e:
|
@@ -29,6 +34,35 @@ def _raw_read(name: str) -> tuple[AudioT, int]:
|
|
29
34
|
return np.squeeze(raw[:, 0]), sample_rate
|
30
35
|
|
31
36
|
|
37
|
+
def get_sample_rate(name: str) -> int:
    """Get sample rate from audio file using soundfile

    Files ending in .mp3 or .m4a are opened with pydub's AudioSegment; every other file is queried
    with soundfile.info().

    :param name: File name; it is passed through tokenized_expand() before use
    :return: Sample rate
    :raises SonusAIError: If the file cannot be read
    """
    import soundfile
    from pydub import AudioSegment

    from sonusai import SonusAIError
    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        if expanded_name.endswith('.mp3'):
            return AudioSegment.from_mp3(expanded_name).frame_rate

        if expanded_name.endswith('.m4a'):
            return AudioSegment.from_file(expanded_name).frame_rate

        return soundfile.info(expanded_name).samplerate
    except Exception as e:
        # Chain explicitly so the underlying error stays attached as the cause of the SonusAIError
        if name != expanded_name:
            raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}): {e}') from e
        raise SonusAIError(f'Error reading {name}: {e}') from e
|
64
|
+
|
65
|
+
|
32
66
|
def read_ir(name: str) -> ImpulseResponseData:
|
33
67
|
"""Read impulse response data using soundfile
|
34
68
|
|
@@ -87,8 +121,13 @@ def get_num_samples(name: str) -> int:
|
|
87
121
|
sound = AudioSegment.from_mp3(expanded_name)
|
88
122
|
samples = sound.frame_count()
|
89
123
|
sample_rate = sound.frame_rate
|
124
|
+
elif expanded_name.endswith('.m4a'):
|
125
|
+
sound = AudioSegment.from_file(expanded_name)
|
126
|
+
samples = sound.frame_count()
|
127
|
+
sample_rate = sound.frame_rate
|
90
128
|
else:
|
91
129
|
info = soundfile.info(name)
|
92
130
|
samples = info.frames
|
93
131
|
sample_rate = info.samplerate
|
132
|
+
|
94
133
|
return math.ceil(SAMPLE_RATE * samples / sample_rate)
|
sonusai/mixture/speaker_metadata.py
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
from functools import cached_property
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
from praatio import textgrid
|
5
|
+
from praatio.data_classes.textgrid_tier import TextgridTier
|
6
|
+
from praatio.utilities.constants import Interval
|
7
|
+
|
8
|
+
from sonusai.mixture.datatypes import TargetFiles
|
9
|
+
from sonusai.mixture.tokenized_shell_vars import tokenized_expand
|
10
|
+
|
11
|
+
|
12
|
+
class SpeakerMetadata:
    """TextGrid tiers for a set of target files.

    self.data maps each target file name to a dictionary of tier name -> tier object, read from a
    '.TextGrid' file with the same (expanded) path as the target file. A target file without a
    TextGrid file maps to an empty dictionary.
    """

    def __init__(self, target_files: TargetFiles) -> None:
        """Load the tiers for every target file

        :param target_files: Target files to look up TextGrid files for
        """
        self.data: dict[str, dict[str, TextgridTier]] = {}
        for file in target_files:
            tiers: dict[str, TextgridTier] = {}
            self.data[file.name] = tiers

            file_name, _ = tokenized_expand(file.name)
            tg_file = Path(file_name).with_suffix('.TextGrid')
            if tg_file.exists():
                tg = textgrid.openTextgrid(str(tg_file), includeEmptyIntervals=False)
                for tier in tg.tierNames:
                    tiers[tier] = tg.getTier(tier)

    @cached_property
    def tiers(self) -> list[str]:
        """Sorted, unique tier names found across all target files."""
        return sorted({tier for tiers in self.data.values() for tier in tiers})

    def all(self, tier: str, label_only: bool = False) -> list[list[Interval]] | list[str]:
        """Get the entries of a tier from every target file that has it

        Target files without the tier (including those without a TextGrid file) are skipped rather
        than raising KeyError.

        :param tier: Tier name
        :param label_only: If True, return the sorted, unique entry labels instead of the entries
        :return: One collection of entries per target file that has the tier, or a sorted list of
            unique labels when label_only is True
        """
        results = [tiers[tier].entries for tiers in self.data.values() if tier in tiers]
        if label_only:
            return sorted({entry.label for entries in results for entry in entries})
        return results

    def mixids_for(self, tier: str, value: str) -> list[int]:
        """Placeholder: get mixids for the given tier and value

        NOTE(review): not implemented - this currently does nothing and returns None despite the
        list[int] annotation. This class only holds target file data, so it has no mixtures to search.
        """
        pass
|
sonusai/mixture/torchaudio_audio.py
CHANGED
@@ -39,6 +39,28 @@ def read_impulse_response(name: str) -> ImpulseResponseData:
|
|
39
39
|
return ImpulseResponseData(name=name, sample_rate=sample_rate, data=data)
|
40
40
|
|
41
41
|
|
42
|
+
def get_sample_rate(name: str) -> int:
    """Get sample rate from audio file using torchaudio

    :param name: File name; it is passed through tokenized_expand() before use
    :return: Sample rate
    :raises SonusAIError: If the file cannot be read
    """
    import torchaudio

    from sonusai import SonusAIError
    from .tokenized_shell_vars import tokenized_expand

    expanded_name, _ = tokenized_expand(name)

    try:
        return torchaudio.info(expanded_name).sample_rate
    except Exception as e:
        # Chain explicitly so the underlying error stays attached as the cause of the SonusAIError
        if name != expanded_name:
            raise SonusAIError(f'Error reading {name} (expanded: {expanded_name}):\n{e}') from e
        raise SonusAIError(f'Error reading {name}:\n{e}') from e
|
62
|
+
|
63
|
+
|
42
64
|
def read_audio(name: str) -> AudioT:
|
43
65
|
"""Read audio data from a file using torchaudio
|
44
66
|
|
sonusai/mkmanifest.py
CHANGED
@@ -94,10 +94,10 @@ def main() -> None:
|
|
94
94
|
from sonusai import initial_log_messages
|
95
95
|
from sonusai import logger
|
96
96
|
from sonusai import update_console_handler
|
97
|
+
from sonusai.utils import PathInfo
|
97
98
|
from sonusai.utils import braced_iglob
|
98
99
|
from sonusai.utils import pp_tqdm_imap
|
99
100
|
from sonusai.utils import seconds_to_hms
|
100
|
-
from sonusai.utils.asr_manifest_functions import PathInfo
|
101
101
|
from sonusai.utils.asr_manifest_functions import collect_librispeech_transcripts
|
102
102
|
from sonusai.utils.asr_manifest_functions import collect_vctk_noisy_speech_transcripts
|
103
103
|
from sonusai.utils.asr_manifest_functions import get_librispeech_manifest_entry
|