sonusai 0.17.3__py3-none-any.whl → 0.18.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonusai/__init__.py +0 -1
- sonusai/calc_metric_spenh.py +74 -45
- sonusai/doc/doc.py +0 -24
- sonusai/genmetrics.py +146 -0
- sonusai/genmixdb.py +0 -2
- sonusai/mixture/__init__.py +0 -1
- sonusai/mixture/constants.py +0 -1
- sonusai/mixture/datatypes.py +2 -9
- sonusai/mixture/db_datatypes.py +72 -0
- sonusai/mixture/generation.py +139 -38
- sonusai/mixture/helpers.py +75 -16
- sonusai/mixture/mapped_snr_f.py +56 -9
- sonusai/mixture/mixdb.py +347 -226
- sonusai/mixture/tokenized_shell_vars.py +8 -1
- sonusai/speech/textgrid.py +6 -24
- {sonusai-0.17.3.dist-info → sonusai-0.18.1.dist-info}/METADATA +3 -1
- {sonusai-0.17.3.dist-info → sonusai-0.18.1.dist-info}/RECORD +19 -24
- sonusai/mixture/speaker_metadata.py +0 -35
- sonusai/mkmanifest.py +0 -209
- sonusai/utils/asr_manifest_functions/__init__.py +0 -6
- sonusai/utils/asr_manifest_functions/data.py +0 -1
- sonusai/utils/asr_manifest_functions/librispeech.py +0 -46
- sonusai/utils/asr_manifest_functions/mcgill_speech.py +0 -29
- sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +0 -66
- {sonusai-0.17.3.dist-info → sonusai-0.18.1.dist-info}/WHEEL +0 -0
- {sonusai-0.17.3.dist-info → sonusai-0.18.1.dist-info}/entry_points.txt +0 -0
sonusai/mixture/tokenized_shell_vars.py
CHANGED
@@ -1,4 +1,7 @@
-def tokenized_expand(name: str | bytes) -> tuple[str, dict[str, str]]:
+from pathlib import Path
+
+
+def tokenized_expand(name: str | bytes | Path) -> tuple[str, dict[str, str]]:
     """Expand shell variables of the forms $var, ${var} and %var%.
     Unknown variables are left unchanged.
 
@@ -25,6 +28,9 @@ def tokenized_expand(name: str | bytes) -> tuple[str, dict[str, str]]:
     if isinstance(name, bytes):
         name = name.decode('utf-8')
 
+    if isinstance(name, Path):
+        name = name.as_posix()
+
     name = os.fspath(name)
     token_map: dict = {}
 
@@ -121,6 +127,7 @@ def tokenized_expand(name: str | bytes) -> tuple[str, dict[str, str]]:
         else:
             result += c
             index += 1
+
     return result, token_map
 
 
sonusai/speech/textgrid.py
CHANGED
@@ -6,37 +6,19 @@ from praatio.utilities.constants import Interval
 from .types import TimeAlignedType
 
 
-def _get_duration(name: str) -> float:
-    from pydub import AudioSegment
-
-    from sonusai import SonusAIError
-
-    try:
-        return AudioSegment.from_file(name).duration_seconds
-    except Exception as e:
-        raise SonusAIError(f'Error reading {name}: {e}')
-
-
 def create_textgrid(prompt: Path,
-                    speaker_id: str,
-                    speaker: dict,
                     output_dir: Path,
                     text: TimeAlignedType = None,
                     words: list[TimeAlignedType] = None,
                     phonemes: list[TimeAlignedType] = None) -> None:
-    if text is
-        min_t, max_t = _get_min_max({'phonemes': phonemes,
-                                     'text': [text],
-                                     'words': words})
-    else:
-        min_t = 0
-        max_t = _get_duration(str(prompt))
+    if text is None and words is None and phonemes is None:
+        return
 
-
+    min_t, max_t = _get_min_max({'phonemes': phonemes,
+                                 'text': [text],
+                                 'words': words})
 
-    tg
-    for tier in speaker.keys():
-        tg.addTier(textgrid.IntervalTier(tier, [Interval(min_t, max_t, str(speaker[tier]))], min_t, max_t))
+    tg = textgrid.Textgrid()
 
     if text is not None:
         entries = [Interval(text.start, text.end, text.text)]
{sonusai-0.17.3.dist-info → sonusai-0.18.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonusai
-Version: 0.17.3
+Version: 0.18.1
 Summary: Framework for building deep neural network models for sound, speech, and voice AI
 Home-page: https://aaware.com
 License: GPL-3.0-only
@@ -21,6 +21,7 @@ Requires-Dist: h5py (>=3.11.0,<4.0.0)
 Requires-Dist: jiwer (>=3.0.3,<4.0.0)
 Requires-Dist: librosa (>=0.10.1,<0.11.0)
 Requires-Dist: matplotlib (>=3.8.0,<4.0.0)
+Requires-Dist: mgzip (>=0.2.1,<0.3.0)
 Requires-Dist: numpy (>=1.26.4,<2.0.0)
 Requires-Dist: onnx (>=1.14.1,<2.0.0)
 Requires-Dist: onnxruntime (>=1.16.1,<2.0.0)
@@ -28,6 +29,7 @@ Requires-Dist: paho-mqtt (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.1.1,<3.0.0)
 Requires-Dist: pesq (>=0.0.4,<0.0.5)
 Requires-Dist: praatio (>=6.2.0,<7.0.0)
+Requires-Dist: psutil (>=5,<6)
 Requires-Dist: pyaaware (>=1.5.7,<2.0.0)
 Requires-Dist: pyaudio (>=0.2.14,<0.3.0)
 Requires-Dist: pydub (>=0.25.1,<0.26.0)
{sonusai-0.17.3.dist-info → sonusai-0.18.1.dist-info}/RECORD
CHANGED
@@ -1,17 +1,18 @@
-sonusai/__init__.py,sha256=
+sonusai/__init__.py,sha256=j2eH_QUsIIMm0HDiNHC5HCocWsX_GhtvlmTkT7zyYOw,2918
 sonusai/aawscd_probwrite.py,sha256=GukR5owp_0A3DrqSl9fHWULYgclNft4D5OkHIwfxxkc,3698
 sonusai/audiofe.py,sha256=zOySiYs5ZZm60eMbA7RjhG6C0Ouhaii3WfL1d0Q8rxg,11154
-sonusai/calc_metric_spenh.py,sha256=
+sonusai/calc_metric_spenh.py,sha256=SunJD8wkdUxyL0rRZt2auauZBEUzpi0IRY8MtXKh3wo,63645
 sonusai/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonusai/data/genmixdb.yml,sha256=-XSs_hUR6wHJVoTPmSewzXL7u61X-xmHY46lNPatxSE,1025
 sonusai/data/speech_ma01_01.wav,sha256=PK0vMKg-NR6rPE3KouxHGF6PKXnJCr7AwjMqfu98LUA,76644
 sonusai/data/whitenoise.wav,sha256=I2umov0m34y56F9IsIBi1XtE76ZeZaSKDf70cJRe3pI,1920044
 sonusai/doc/__init__.py,sha256=rP5Hgn0Iys_xkuv4caxngdqehuU4zLZsiKuv8Nde67M,19
-sonusai/doc/doc.py,sha256=
+sonusai/doc/doc.py,sha256=LOf4HiIxMeQ7-n0ExDSldo1Tt1036SVaN_auwlqXUxQ,21489
 sonusai/doc.py,sha256=l8CaFgLI8mqx4tn0aXfxKqa2dy9GgC0zjYxZAkpmi1E,878
 sonusai/genft.py,sha256=OzET3iTE-QhrUckzidfZvCDXZlAxIF5Xe5NEf856Vvk,5662
+sonusai/genmetrics.py,sha256=fC8KPIB6wtBj_hs1X65lq3dqNTcWeuNs1eT7yXdpxD8,1830
 sonusai/genmix.py,sha256=TU5aTebGHsbfwsRbynYbegGBelSma9khuQkDk0dFE3I,7075
-sonusai/genmixdb.py,sha256=
+sonusai/genmixdb.py,sha256=gF2qas1tH9MHEFLoEMrN3kYVm-vhAKaOuZ8ev-w4MQM,19553
 sonusai/gentcst.py,sha256=W1ZO3xs7CoZkFcvOTH-FLJOIA4I7Wzb0HVRC3hGGSaM,20223
 sonusai/lsdb.py,sha256=fMRqPlAu4B-4MsTXX-NaWXYyJ_dAOJlS-LrvQPQQsXg,6028
 sonusai/main.py,sha256=GC-pQrSqx9tWwIcmEo6V9SraEv5KskBLS_W_wz-f2ZM,2509
@@ -28,28 +29,28 @@ sonusai/metrics/class_summary.py,sha256=4Mb25nuk6eqotnQSFMuOQL3zofGcpNXDfDlPa513
 sonusai/metrics/confusion_matrix_summary.py,sha256=3qg6TMKjJeHtNjj2YnNjPFSlMrQXt0Zcu1dLkGB_aPU,4001
 sonusai/metrics/one_hot.py,sha256=QSeH_GdqBpOAKLrNnQ8gjcPC-vSdUqC0yPEQueTA6VI,13548
 sonusai/metrics/snr_summary.py,sha256=P4U5_Xr7v9F8kF-rZBnpsVNt3p42rIVS6zmch8yfVfg,5575
-sonusai/mixture/__init__.py,sha256=
+sonusai/mixture/__init__.py,sha256=rFaHyroCf0Fd-SuxmH4dl8xZVjOe8gFjndouv9RtzIE,5347
 sonusai/mixture/audio.py,sha256=2lqy0DtTMTYhX4aAOIvVtLNn6QB5ivTa7cJIaAlbfAg,2385
 sonusai/mixture/augmentation.py,sha256=Blb90tdTwBOj5w9tRcYyS5H67YJuFiXsGqwZWd7ON4g,10468
 sonusai/mixture/class_count.py,sha256=_wFnVl2yEOnbor7pLg7cYOUeX6nioov-03Cv3SEbh2k,996
 sonusai/mixture/config.py,sha256=d2IzZ1samHWGMpoKzSmUwMyAWWhgmyNoxyO8oiUwbsg,22193
-sonusai/mixture/constants.py,sha256=
-sonusai/mixture/datatypes.py,sha256=
+sonusai/mixture/constants.py,sha256=ZRM7Z8e6EwnL9RwaMVxks-QokN9KMWxnQzAf9VNxa9M,1408
+sonusai/mixture/datatypes.py,sha256=uVJtT2sVGS531pSglhaLLF5hZcI3_0oKQOWmMkrCwqo,8253
+sonusai/mixture/db_datatypes.py,sha256=GDYbcSrlgUJsesiUUNnR4s5aBkMgviiNSQDaBcgYX7I,1428
 sonusai/mixture/eq_rule_is_valid.py,sha256=MpQwRA5M76wSiQWEI1lW2cLFdPaMttBLcQp3tWD8efM,1243
 sonusai/mixture/feature.py,sha256=Rwuf82IoXzhHPGbKYVGcatImF_ssBf_FfvbqghVPXtg,4116
-sonusai/mixture/generation.py,sha256=
-sonusai/mixture/helpers.py,sha256=
+sonusai/mixture/generation.py,sha256=ohZnhtHIrdQDql2OF703NnhK07Ys-1qAjiwrIql-oMw,42694
+sonusai/mixture/helpers.py,sha256=eC9ZysEa-83VLKen_9PKWzr8w9dkHj4lp6rMB2fNLbg,24759
 sonusai/mixture/log_duration_and_sizes.py,sha256=baTUpqyM15wA125jo9E3posmVJUe3WlpksyO6v9Jul0,1347
-sonusai/mixture/mapped_snr_f.py,sha256=
-sonusai/mixture/mixdb.py,sha256=
+sonusai/mixture/mapped_snr_f.py,sha256=Fdf2uw62FvyKvVy5VywaUtPZGO1zCWQsHlte0bwkKPQ,3121
+sonusai/mixture/mixdb.py,sha256=XSJQKYaUfQylpWbGjfxfP7dendr-9JFcmLWNyE0qUwQ,51697
 sonusai/mixture/soundfile_audio.py,sha256=mHa5SIXsu_uE0j3DO52GydRJrvWSzU_nII-7YJfQ6Qo,4154
 sonusai/mixture/sox_audio.py,sha256=HT3kYA9TP5QPCuoOJdUMnGVN-qY6q96DGL8zxuog76o,12277
 sonusai/mixture/sox_augmentation.py,sha256=kBWPrsFk0EBi71nLcKt5v0GA34bY7g9D9x0cEamNWbU,4564
-sonusai/mixture/speaker_metadata.py,sha256=l98avdxLYUsSDZ88xUjfvHnACkbnD0_Dg1aBGDbzS9I,1380
 sonusai/mixture/spectral_mask.py,sha256=8AkCwhy-PSdP1Uri9miKZP-bXFYnFcH_c9xZCGrHavU,2071
 sonusai/mixture/target_class_balancing.py,sha256=NTNiKZH0_PWLooeow0l41CjJKK8ZTMVbUqz9ZkaNtWk,4900
 sonusai/mixture/targets.py,sha256=wyy5vhLhuN-hqBMBGoziVvEJg3FKFvJFgmEE7_LaV2M,7908
-sonusai/mixture/tokenized_shell_vars.py,sha256=
+sonusai/mixture/tokenized_shell_vars.py,sha256=zIAFvwP2WSvkMAGY7f3SJ4KLXI6IBT-U_e9ptnoo5Hc,4803
 sonusai/mixture/torchaudio_audio.py,sha256=KhHeOMsjmbwOaAcoKD61aFvYBYSlA8OysfT5iGn45MA,3010
 sonusai/mixture/torchaudio_augmentation.py,sha256=1vEDHI0caL1vrgoY2lAWe4CiHE2jKRuKKH7x23GHw0w,4390
 sonusai/mixture/truth.py,sha256=Y41pZ52Xkols9LUler0NlgnilUOscBIucmw4GcxXNzU,1612
@@ -61,7 +62,6 @@ sonusai/mixture/truth_functions/file.py,sha256=jOJuC_3y9BH6GGOp9eKcbVrHLVRzUA80B
 sonusai/mixture/truth_functions/phoneme.py,sha256=stYdlPuNytQK_LLT61OJLfYSqKd-sDjQZdtJKGzt5wA,479
 sonusai/mixture/truth_functions/sed.py,sha256=8cHjEFjZaH_0hIOHhPmj4AJz2GpEADM6Ys2x4NoiWSY,2469
 sonusai/mixture/truth_functions/target.py,sha256=KAsjugDRooOA5BRcHVAbZRgV7l8S5CFg7CZ0XtKZaQ0,5764
-sonusai/mkmanifest.py,sha256=imI8swwPYVzumrUYEL-9JLvun-ez98PtlUBj2b729k8,8682
 sonusai/mkwav.py,sha256=zfSyIiQTIK3KV9Ij33jkLhhZIMVYqaROcRQ4S7c4sIo,5364
 sonusai/onnx_predict.py,sha256=jSxhD2oFyGSTHOGCXbW4fRT-k4SqKOboK2JaDO-yWcs,8737
 sonusai/plot.py,sha256=ERkmxMM3qjcCDm4LGDQY4fRAncCYAzP7uW8iZ7_brcg,17105
@@ -72,7 +72,7 @@ sonusai/speech/__init__.py,sha256=SuPcU_K9wQISsZRIzsRNLtEC6cb616l-Jlx3PU-HWMs,11
 sonusai/speech/l2arctic.py,sha256=28TT3CohvPu98YNUb8O7rWHAYgPGwYTOLSdfNQjOuyc,3736
 sonusai/speech/librispeech.py,sha256=A0IpamojCPXyJiHcjCtI7yNWdMjB00cbggjHslssrg8,3120
 sonusai/speech/mcgill.py,sha256=jcddj64fLdV3sO6CJNafm3w-2SnYoyQtU90odXhdaaE,1976
-sonusai/speech/textgrid.py,sha256=
+sonusai/speech/textgrid.py,sha256=MVgpZhoL_ZXffqzqgC4N_EVmVpYP3FaOaNP9iTDUsUU,2722
 sonusai/speech/timit.py,sha256=1vWgj6isD3ATOjMJSTjOPLmDkYyB65M5MwYipEmLEvg,4081
 sonusai/speech/types.py,sha256=4eKVPAktpkIrZ2qoVp2iT45zxTVNocQEGT6O_Zlub_w,214
 sonusai/speech/vctk.py,sha256=EAMEBAzjZUI6dw15n-yI2oCN-H4tzM9t4aUVlOxpAbo,1540
@@ -84,11 +84,6 @@ sonusai/utils/asl_p56.py,sha256=-bvQpd-jRQVURbkZJpRoyEAq6gTv9Rc3oFDbh5_lcjY,3861
 sonusai/utils/asr.py,sha256=6y6VYJizHpuQ3MgKbEQ4t2gofO-MW6Ez23oAd6d23IE,2920
 sonusai/utils/asr_functions/__init__.py,sha256=JyHK67s97bw7QzrlkboWhws4yNytdPatqzLJxfwx-yw,43
 sonusai/utils/asr_functions/aaware_whisper.py,sha256=Ew3zb8OfbxEW7q1s-KA7D5eph4SjVSUAJgiLK-vVqhI,1985
-sonusai/utils/asr_manifest_functions/__init__.py,sha256=jfi9xC5c86F_aMSsI5Xj-pxWGxuQ7fwZ8Wdf4T7kDsA,343
-sonusai/utils/asr_manifest_functions/data.py,sha256=nO4oT3EQmydwn1pzc-ZM09yz4X2ic-LQuHzGEnJhKe8,32
-sonusai/utils/asr_manifest_functions/librispeech.py,sha256=_3tGc8qfAUpYJZ0_avpW0vGp7zjdpeqj1HAgXi3TL4Q,1612
-sonusai/utils/asr_manifest_functions/mcgill_speech.py,sha256=dW-5XTC5xOY3PHU2DvlWNWDeoprXDD0Zq2dXDdPAjzE,934
-sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py,sha256=9iMrnE-qabLMnyewyxsBMl0uCS8yS7BPJOdmUoOnGAc,2146
 sonusai/utils/audio_devices.py,sha256=LgaXTln1oRArBzaet3rZiIO2plgtaThuGBc3sJ_sLlo,1414
 sonusai/utils/braced_glob.py,sha256=Z_XIpPK17QiP1JbzAnUC5w3oyG8ZovoyM22Wh-Q_vWU,1675
 sonusai/utils/calculate_input_shape.py,sha256=63ILxibYKuTQozY83QN8Y2OOhBEbW_1X47Q0askcHDM,984
@@ -121,7 +116,7 @@ sonusai/utils/stratified_shuffle_split.py,sha256=rJNXvBp-GxoKzH3OpL7k0ANSu5xMP2z
 sonusai/utils/write_audio.py,sha256=ZsPGExwM86QHLLN2LOWekK2uAqf5pV_1oRW811p0QAI,840
 sonusai/utils/yes_or_no.py,sha256=eMLXBVH0cEahiXY4W2KNORmwNQ-ba10eRtldh0y4NYg,263
 sonusai/vars.py,sha256=m2AefF0m5bXWGXpJj8Pi42zWL2ydeEj7bkak3GrtMyM,940
-sonusai-0.
-sonusai-0.
-sonusai-0.
-sonusai-0.
+sonusai-0.18.1.dist-info/METADATA,sha256=cLhSYAb5FSCHlrBCG0xA2hwa_4S86JUEWIqY85xA5o8,2591
+sonusai-0.18.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sonusai-0.18.1.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
+sonusai-0.18.1.dist-info/RECORD,,
sonusai/mixture/speaker_metadata.py
DELETED
@@ -1,35 +0,0 @@
-from functools import cached_property
-from pathlib import Path
-
-from praatio import textgrid
-from praatio.data_classes.textgrid_tier import TextgridTier
-from praatio.utilities.constants import Interval
-
-from sonusai.mixture.datatypes import TargetFiles
-from sonusai.mixture.tokenized_shell_vars import tokenized_expand
-
-
-class SpeakerMetadata:
-    def __init__(self, target_files: TargetFiles) -> None:
-        self.data: dict[str, dict[str, TextgridTier]] = {}
-        for file in target_files:
-            self.data[file.name] = {}
-            file_name, _ = tokenized_expand(file.name)
-            tg_file = Path(file_name).with_suffix('.TextGrid')
-            if tg_file.exists():
-                tg = textgrid.openTextgrid(str(tg_file), includeEmptyIntervals=False)
-                for tier in tg.tierNames:
-                    self.data[file.name][tier] = tg.getTier(tier)
-
-    @cached_property
-    def tiers(self) -> list[str]:
-        return sorted(list(set([key for value in self.data.values() for key in value.keys()])))
-
-    def all(self, tier: str, label_only: bool = False) -> list[Interval]:
-        results = [value[tier].entries for value in self.data.values()]
-        if label_only:
-            return sorted(set([r.label for result in results for r in result]))
-        return results
-
-    def mixids_for(self, tier: str, value: str) -> list[int]:
-        pass
sonusai/mkmanifest.py
DELETED
@@ -1,209 +0,0 @@
-"""mkmanifest
-
-usage: mkmanifest [-hvn] [--include GLOB] [-m METHOD] [-e ADAT] [-o OUTPUT] PATH ...
-
-options:
-    -h, --help
-    -v, --verbose                   Be verbose: list all files found.
-    -n, --dry-run                   Collect files, but exit without processing and writing manifest file.
-    --include GLOB                  Search only files whose base name matches GLOB. [default: *.{wav,flac}].
-    -m METHOD, --method METHOD      Method for getting the true speech text of the audio files. [default: librispeech].
-    -e ADAT, --audio-env ADAT       Environment variable pointing to all audio data.
-    -o OUTPUT, --output OUTPUT      Output file name. [default: asr_manifest.json].
-
-Make a speech recognition (ASR) .json manifest file of all audio files under PATHS following the NVIDIA NeMo format.
-An example of manifest entries:
-
-{"audio_filepath": "<absolute_path_to>/1355-39947-0000.wav", "duration": 11.3, "text": "psychotherapy ..."}
-{"audio_filepath": "<absolute_path_to>/1355-39947-0001.wav", "duration": 15.905, "text": "it is an ..."}
-
-See the NVIDIA NeMo docs for more information:
-https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/datasets.html
-
-Inputs:
-    PATH        A relative path name or list of paths containing audio files. Each will be
-                recursively searched for files matching the pattern GLOB.
-    GLOB        Match the pattern GLOB using wildcard matching.
-                Example: '*.{wav,flac}' matches all .wav and .flac files.
-    METHOD      The method to use for fetching the true speech of the audio files.
-                Supported methods:
-                - 'librispeech'
-                - 'vctk_noisy_speech' expects subdirs named like <name>_wav/ and <name>_txt/ with files in
-                  each using same basename, but with .wav and .txt respectively.
-                - 'mcgill-speech' expects audio data in basename/speakerid/speakerid-promptid.wav and
-                  transcript data in Scripts/HarvardLists.dat
-    ADAT        Audio data environment variable. All found files will be expanded to their full, absolute path and
-                then parts of the path that match the specified environment variable value will be replaced with
-                the variable. This accommodates portability across platforms where the sound datasets may in
-                different locations.
-    OUTPUT      Name of output file. Default is asr_manifest.json.
-
-Outputs the following to the current directory:
-    <OUTPUT>
-    mkmanifest.log
-
-Example usage for LibriSpeech:
-    sonusai mkmanifest -mlibrispeech -eADAT -oasr_manifest.json --include='*.flac' train-clean-100
-    sonusai mkmanifest -m mcgill-speech -e ADAT -o asr_manifest_16k.json 16k-LP7/
-"""
-import signal
-
-
-def signal_handler(_sig, _frame):
-    import sys
-
-    from sonusai import logger
-
-    logger.info('Canceled due to keyboard interrupt')
-    sys.exit(1)
-
-
-signal.signal(signal.SIGINT, signal_handler)
-
-VALID_METHOD = ['librispeech', 'vctk_noisy_speech', 'mcgill-speech']
-
-
-def main() -> None:
-    from docopt import docopt
-
-    import sonusai
-    from sonusai.utils import trim_docstring
-
-    args = docopt(trim_docstring(__doc__), version=sonusai.__version__, options_first=True)
-
-    verbose = args['--verbose']
-    dry_run = args['--dry-run']
-    include = args['--include']
-    method = args['--method']
-    audio_env = args['--audio-env']
-    output = args['--output']
-    paths = args['PATH']
-
-    import json
-    from functools import partial
-    import time
-    from os import environ
-    from os.path import abspath
-    from os.path import join
-    from os.path import realpath
-
-    from tqdm import tqdm
-
-    from sonusai import SonusAIError
-    from sonusai import create_file_handler
-    from sonusai import initial_log_messages
-    from sonusai import logger
-    from sonusai import update_console_handler
-    from sonusai.utils import PathInfo
-    from sonusai.utils import braced_iglob
-    from sonusai.utils import pp_tqdm_imap
-    from sonusai.utils import seconds_to_hms
-    from sonusai.utils.asr_manifest_functions import collect_librispeech_transcripts
-    from sonusai.utils.asr_manifest_functions import collect_vctk_noisy_speech_transcripts
-    from sonusai.utils.asr_manifest_functions import get_librispeech_manifest_entry
-    from sonusai.utils.asr_manifest_functions import get_vctk_noisy_speech_manifest_entry
-    from sonusai.utils.asr_manifest_functions import get_mcgill_speech_manifest_entry
-
-    start_time = time.monotonic()
-
-    create_file_handler('mkmanifest.log')
-    update_console_handler(verbose)
-    initial_log_messages('mkmanifest')
-
-    if method not in VALID_METHOD:
-        raise SonusAIError(f'Unknown method: {method}')
-
-    audio_dir = None
-    if audio_env is not None:
-        audio_dir = realpath(environ[audio_env])
-        if audio_dir is None:
-            raise SonusAIError(f'Unknown environment variable: {audio_env}')
-
-    if audio_env:
-        for p in paths:
-            if not realpath(abspath(p)).startswith(audio_dir):
-                logger.warning(f'Specified directory, {p}, is not part of the provided audio environment: '
-                               f'${audio_env}={audio_dir}')
-
-    logger.info('')
-    logger.info(f'Searching {len(paths)} provided director{"ies" if len(paths) > 1 else "y"}...')
-
-    entries: list[PathInfo] = []
-    for p in paths:
-        location = join(realpath(abspath(p)), '**', include)
-        logger.debug(f'Processing {location}')
-        for file in braced_iglob(pathname=location, recursive=True):
-            name = file
-            if audio_env is not None:
-                name = name.replace(audio_dir, f'${audio_env}')
-            entries.append(PathInfo(abs_path=file, audio_filepath=name))
-        logger.debug('')
-
-    logger.info(f'Found {len(entries)} audio file{"s" if len(entries) != 1 else ""}')
-
-    if dry_run:
-        logger.info('')
-        logger.info('Dry run')
-        logger.info('')
-        for entry in entries:
-            logger.info(f'  - {entry.audio_filepath}')
-        return
-
-    if method == 'librispeech':
-        logger.info('Collecting LibriSpeech transcript data')
-        transcript_data = collect_librispeech_transcripts(paths=paths)
-
-        processing_func = partial(get_librispeech_manifest_entry, transcript_data=transcript_data)
-        progress = tqdm(total=len(entries), desc='Creating LibriSpeech manifest data')
-        results = pp_tqdm_imap(processing_func, entries, progress=progress)
-        progress.close()
-
-        with open(output, 'w') as f:
-            for result in results:
-                f.write(json.dumps(result) + '\n')
-
-    if method == 'vctk_noisy_speech':
-        logger.info('Collecting VCTK Noisy Speech transcript data')
-        transcript_data = collect_vctk_noisy_speech_transcripts(paths=paths)
-
-        processing_func = partial(get_vctk_noisy_speech_manifest_entry, transcript_data=transcript_data)
-        progress = tqdm(total=len(entries), desc='Creating VCTK Noisy Speech manifest data')
-        results = pp_tqdm_imap(processing_func, entries, progress=progress)
-        progress.close()
-
-        with open(output, 'w') as f:
-            for result in results:
-                f.write(json.dumps(result) + '\n')
-
-    if method == 'mcgill-speech':
-        logger.info(f'Found {len(entries)} Mcgill Speech files, opening prompt file ...')
-        # Note expecting only one path pointing to data subdir
-        if len(paths) != 1:
-            raise SonusAIError(f'mcgill-speech only support a single path')
-        prompt_fpath = join(join(realpath(abspath(paths[0]))), '../Scripts/HarvardList.dat')
-        with open(prompt_fpath, encoding='utf-8') as f:
-            lines = f.readlines()
-
-        logger.info(f'Found {len(lines) - 4} entries in prompt file.')
-        # First 4 lines are header stuff, can use remaining directly with simple lookup
-        # example line: '01_02:Glue the sheet ...\n' (paragraph 1, sentence 2)
-        # 11 entries per group, so getting line is 11*(p1-1)+(s2-1)
-        lines = lines[4:]
-
-        processing_func = partial(get_mcgill_speech_manifest_entry, transcript_data=lines)
-        progress = tqdm(total=len(entries), desc='Creating Mcgill Speech manifest data')
-        results = pp_tqdm_imap(processing_func, entries, progress=progress)
-        progress.close()
-
-        with open(output, 'w') as f:
-            for result in results:
-                f.write(json.dumps(result) + '\n')
-
-    end_time = time.monotonic()
-    logger.info('')
-    logger.info(f'Completed in {seconds_to_hms(seconds=end_time - start_time)}')
-    logger.info('')
-
-
-if __name__ == '__main__':
-    main()
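mkmanifest.py is removed in 0.18.1 and this diff shows no direct replacement for it. For reference, the NeMo-style manifest it produced (one JSON object per line with audio_filepath, duration, and text, as shown in the removed docstring above) can still be written directly. A minimal sketch, assuming soundfile is available for reading durations; the helper name and its input mapping are hypothetical:

import json
from pathlib import Path

import soundfile


def write_nemo_manifest(files_and_text: dict[str, str], output: str = 'asr_manifest.json') -> None:
    """Write one JSON object per line: audio_filepath, duration, text (NVIDIA NeMo format)."""
    with open(output, 'w') as f:
        for path, text in files_and_text.items():
            entry = {
                'audio_filepath': str(Path(path).resolve()),
                'duration': soundfile.info(path).duration,  # seconds, standing in for the removed soxi -D call
                'text': text.lower().strip(),
            }
            f.write(json.dumps(entry) + '\n')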
sonusai/utils/asr_manifest_functions/__init__.py
DELETED
@@ -1,6 +0,0 @@
-from .data import TranscriptData
-from .librispeech import collect_librispeech_transcripts
-from .librispeech import get_librispeech_manifest_entry
-from .vctk_noisy_speech import collect_vctk_noisy_speech_transcripts
-from .vctk_noisy_speech import get_vctk_noisy_speech_manifest_entry
-from .mcgill_speech import get_mcgill_speech_manifest_entry
sonusai/utils/asr_manifest_functions/data.py
DELETED
@@ -1 +0,0 @@
-TranscriptData = dict[str, str]
sonusai/utils/asr_manifest_functions/librispeech.py
DELETED
@@ -1,46 +0,0 @@
-from sonusai.utils import PathInfo
-from sonusai.utils.asr_manifest_functions import TranscriptData
-
-
-def collect_librispeech_transcripts(paths: list[str] | str) -> TranscriptData:
-    from glob import iglob
-    from os.path import abspath
-    from os.path import dirname
-    from os.path import join
-
-    from sonusai import SonusAIError
-
-    entries: TranscriptData = {}
-    if not isinstance(paths, list):
-        paths = [paths]
-
-    for p in paths:
-        location = join(abspath(p), '**', '*.trans.txt')
-        for file in iglob(pathname=location, recursive=True):
-            root = dirname(file)
-            with open(file, encoding='utf-8') as f:
-                for line in f:
-                    name, text = line[: line.index(' ')], line[line.index(' ') + 1:]
-                    name = join(root, name)
-                    if name in entries:
-                        raise SonusAIError(f'{name} already exists in transcript data')
-                    entries[name] = text.lower().strip()
-    return entries
-
-
-def get_librispeech_manifest_entry(entry: PathInfo, transcript_data: TranscriptData) -> dict:
-    from os.path import splitext
-    from subprocess import check_output
-
-    from sonusai import SonusAIError
-
-    name = splitext(entry.abs_path)[0]
-    duration = float(check_output(f'soxi -D {entry.abs_path}', shell=True))
-    if name not in transcript_data.keys():
-        raise SonusAIError(f'Could not find {name} in transcript data')
-
-    return {
-        'audio_filepath': entry.audio_filepath,
-        'text': transcript_data[name],
-        'duration': duration,
-    }
sonusai/utils/asr_manifest_functions/mcgill_speech.py
DELETED
@@ -1,29 +0,0 @@
-from sonusai.utils import PathInfo
-
-
-def get_mcgill_speech_manifest_entry(entry: PathInfo, transcript_data: list[str]) -> dict:
-    from os.path import splitext
-    from os.path import basename
-    from subprocess import check_output
-
-    from sonusai import SonusAIError
-
-    name = splitext(entry.abs_path)[0]
-    duration = float(check_output(f'soxi -D {entry.abs_path}', shell=True))
-    # i.e., from MA01_02.wav, get 01_02
-    promptname = basename(name)[2:]
-    # paragraph num
-    pnum = int(promptname[0:2])
-    snum = int(promptname[3:5])
-    idx = 11 * (pnum - 1) + (snum - 1)
-    try:
-        # remove prompt-id prefix and \n suffix
-        text = transcript_data[idx][6:-1]
-    except IndexError:
-        raise SonusAIError(f'Could not find {promptname}, idx {idx} in transcript data')
-
-    return {
-        'audio_filepath': entry.audio_filepath,
-        'text': text,
-        'duration': duration,
-    }
sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py
DELETED
@@ -1,66 +0,0 @@
-from sonusai.utils import PathInfo
-from sonusai.utils.asr_manifest_functions import TranscriptData
-
-
-def collect_vctk_noisy_speech_transcripts(paths: list[str] | str) -> TranscriptData:
-    from glob import iglob
-    from os import listdir
-    from os.path import abspath
-    from os.path import basename
-    from os.path import join
-    from os.path import split
-    from os.path import splitext
-
-    from sonusai import SonusAIError
-
-    entries: TranscriptData = {}
-    if not isinstance(paths, list):
-        paths = [paths]
-
-    for p in paths:
-        abs_p = abspath(p)
-        head, tail = split(abs_p)
-
-        dirs = listdir(head)
-        tail = tail.replace('wav', 'txt')
-
-        location = None
-        for d in dirs:
-            if tail.endswith(d):
-                location = join(head, d, '*.txt')
-                break
-        if location is None:
-            raise SonusAIError(f'Could not find VCTK Noisy Speech transcript data for {p}')
-
-        for file in iglob(pathname=location, recursive=True):
-            with open(file, encoding='utf-8') as f:
-                lines = f.readlines()
-                if len(lines) != 1:
-                    raise SonusAIError(f'Ill-formed VCTK Noisy Speech transcript file: {file}')
-
-                name = join(abs_p, splitext(basename(file))[0])
-                text = lines[0].lower().strip()
-
-                if name in entries:
-                    raise SonusAIError(f'{name} already exists in transcript data')
-                entries[name] = text.lower().strip()
-
-    return entries
-
-
-def get_vctk_noisy_speech_manifest_entry(entry: PathInfo, transcript_data: TranscriptData) -> dict:
-    from os.path import splitext
-    from subprocess import check_output
-
-    from sonusai import SonusAIError
-
-    name = splitext(entry.abs_path)[0]
-    duration = float(check_output(f'soxi -D {entry.abs_path}', shell=True))
-    if name not in transcript_data.keys():
-        raise SonusAIError(f'Could not find {name} in transcript data')
-
-    return {
-        'audio_filepath': entry.audio_filepath,
-        'text': transcript_data[name],
-        'duration': duration,
-    }
File without changes
|
File without changes
|