sonusai 0.17.2__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. sonusai/__init__.py +0 -1
  2. sonusai/audiofe.py +3 -3
  3. sonusai/calc_metric_spenh.py +81 -52
  4. sonusai/doc/doc.py +0 -24
  5. sonusai/genmetrics.py +146 -0
  6. sonusai/genmixdb.py +0 -2
  7. sonusai/mixture/__init__.py +0 -1
  8. sonusai/mixture/constants.py +0 -1
  9. sonusai/mixture/datatypes.py +2 -9
  10. sonusai/mixture/generation.py +136 -38
  11. sonusai/mixture/helpers.py +58 -1
  12. sonusai/mixture/mapped_snr_f.py +56 -9
  13. sonusai/mixture/mixdb.py +293 -170
  14. sonusai/mixture/sox_augmentation.py +3 -0
  15. sonusai/mixture/tokenized_shell_vars.py +8 -1
  16. sonusai/mkwav.py +4 -4
  17. sonusai/onnx_predict.py +2 -2
  18. sonusai/post_spenh_targetf.py +2 -2
  19. sonusai/speech/textgrid.py +6 -24
  20. sonusai/speech/{voxceleb2.py → voxceleb.py} +19 -3
  21. sonusai/utils/__init__.py +1 -1
  22. sonusai/utils/asr_functions/aaware_whisper.py +2 -2
  23. sonusai/utils/{wave.py → write_audio.py} +2 -2
  24. {sonusai-0.17.2.dist-info → sonusai-0.18.0.dist-info}/METADATA +4 -1
  25. {sonusai-0.17.2.dist-info → sonusai-0.18.0.dist-info}/RECORD +27 -33
  26. sonusai/mixture/speaker_metadata.py +0 -35
  27. sonusai/mkmanifest.py +0 -209
  28. sonusai/utils/asr_manifest_functions/__init__.py +0 -6
  29. sonusai/utils/asr_manifest_functions/data.py +0 -1
  30. sonusai/utils/asr_manifest_functions/librispeech.py +0 -46
  31. sonusai/utils/asr_manifest_functions/mcgill_speech.py +0 -29
  32. sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +0 -66
  33. {sonusai-0.17.2.dist-info → sonusai-0.18.0.dist-info}/WHEEL +0 -0
  34. {sonusai-0.17.2.dist-info → sonusai-0.18.0.dist-info}/entry_points.txt +0 -0
@@ -137,7 +137,7 @@ def _process(file: str) -> None:
     from sonusai.mixture import get_audio_from_transform
     from sonusai.utils import float_to_int16
     from sonusai.utils import unstack_complex
-    from sonusai.utils import write_wav
+    from sonusai.utils import write_audio
 
     try:
         with h5py.File(file, 'r') as f:
@@ -153,7 +153,7 @@ def _process(file: str) -> None:
                                              bin_end=MP_GLOBAL.bin_end,
                                              ttype=MP_GLOBAL.ttype,
                                              gain=np.float32(1)))
-    write_wav(name=output_name, audio=float_to_int16(audio))
+    write_audio(name=output_name, audio=float_to_int16(audio))
 
 
 if __name__ == '__main__':
sonusai/speech/textgrid.py CHANGED
@@ -6,37 +6,19 @@ from praatio.utilities.constants import Interval
 from .types import TimeAlignedType
 
 
-def _get_duration(name: str) -> float:
-    from pydub import AudioSegment
-
-    from sonusai import SonusAIError
-
-    try:
-        return AudioSegment.from_file(name).duration_seconds
-    except Exception as e:
-        raise SonusAIError(f'Error reading {name}: {e}')
-
-
 def create_textgrid(prompt: Path,
-                    speaker_id: str,
-                    speaker: dict,
                     output_dir: Path,
                     text: TimeAlignedType = None,
                     words: list[TimeAlignedType] = None,
                     phonemes: list[TimeAlignedType] = None) -> None:
-    if text is not None or words is not None or phonemes is not None:
-        min_t, max_t = _get_min_max({'phonemes': phonemes,
-                                     'text': [text],
-                                     'words': words})
-    else:
-        min_t = 0
-        max_t = _get_duration(str(prompt))
+    if text is None and words is None and phonemes is None:
+        return
 
-    tg = textgrid.Textgrid()
+    min_t, max_t = _get_min_max({'phonemes': phonemes,
+                                 'text': [text],
+                                 'words': words})
 
-    tg.addTier(textgrid.IntervalTier('speaker_id', [Interval(min_t, max_t, speaker_id)], min_t, max_t))
-    for tier in speaker.keys():
-        tg.addTier(textgrid.IntervalTier(tier, [Interval(min_t, max_t, str(speaker[tier]))], min_t, max_t))
+    tg = textgrid.Textgrid()
 
     if text is not None:
         entries = [Interval(text.start, text.end, text.text)]
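For orientation (not part of the diff itself): a minimal sketch of calling the slimmed-down create_textgrid now that the speaker_id/speaker tiers are gone. The file paths and alignment values are made up, and it assumes TimeAlignedType is constructed as (start, end, text), as suggested by the fields used above.

from pathlib import Path

from sonusai.speech.textgrid import create_textgrid
from sonusai.speech.types import TimeAlignedType

# Hypothetical word alignments; with no text/words/phonemes the call now returns early.
words = [TimeAlignedType(0.00, 0.42, 'glue'),
         TimeAlignedType(0.42, 0.80, 'the')]

create_textgrid(prompt=Path('prompts/MA01_01.wav'),  # illustrative prompt path
                output_dir=Path('textgrids'),
                words=words)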
sonusai/speech/{voxceleb2.py → voxceleb.py} RENAMED
@@ -16,14 +16,30 @@ def load_speakers(input_dir: Path) -> dict:
     import csv
 
     speakers = {}
+
+    # VoxCeleb1
+    first = True
+    with open(input_dir / 'vox1_meta.csv', newline='') as file:
+        data = csv.reader(file, delimiter='\t')
+        for row in data:
+            if first:
+                first = False
+            else:
+                speakers[row[0].strip()] = {'gender': row[2].strip(),
+                                            'dialect': row[3].strip(),
+                                            'category': row[4].strip()}
+
+    # VoxCeleb2
     first = True
-    with open(input_dir / 'vox2_meta_cleansed.csv', newline='') as file:
-        data = csv.reader(file)
+    with open(input_dir / 'vox2_meta.csv', newline='') as file:
+        data = csv.reader(file, delimiter='\t')
         for row in data:
             if first:
                 first = False
             else:
-                speakers[row[0].strip()] = {'gender': row[2].strip(), 'category': row[3].strip()}
+                speakers[row[1].strip()] = {'gender': row[3].strip(),
+                                            'category': row[4].strip()}
+
     return speakers
 
 
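For context (not part of the diff itself): a hedged sketch of how the updated loader might be used. The directory path is hypothetical, and it assumes the tab-delimited vox1_meta.csv and vox2_meta.csv files sit directly in input_dir, as the parsing above expects.

from pathlib import Path

from sonusai.speech.voxceleb import load_speakers

# Hypothetical location containing vox1_meta.csv and vox2_meta.csv.
speakers = load_speakers(Path('/data/voxceleb'))

# Per the code above, VoxCeleb1 IDs map to {'gender', 'dialect', 'category'}
# and VoxCeleb2 IDs map to {'gender', 'category'}.
for speaker_id, info in list(speakers.items())[:3]:
    print(speaker_id, info)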
sonusai/utils/__init__.py CHANGED
@@ -49,5 +49,5 @@ from .stacked_complex import stacked_complex_imag
 from .stacked_complex import stacked_complex_real
 from .stacked_complex import unstack_complex
 from .stratified_shuffle_split import stratified_shuffle_split_mixid
-from .wave import write_wav
+from .write_audio import write_audio
 from .yes_or_no import yes_or_no
sonusai/utils/asr_functions/aaware_whisper.py CHANGED
@@ -13,7 +13,7 @@ def aaware_whisper(data: ASRData) -> ASRResult:
     from sonusai import SonusAIError
     from sonusai.utils import ASRResult
     from sonusai.utils import float_to_int16
-    from sonusai.utils import write_wav
+    from sonusai.utils import write_audio
 
     url = getenv('AAWARE_WHISPER_URL')
     if url is None:
@@ -22,7 +22,7 @@ def aaware_whisper(data: ASRData) -> ASRResult:
 
     with tempfile.TemporaryDirectory() as tmp:
         file = join(tmp, 'asr.wav')
-        write_wav(name=file, audio=float_to_int16(data.audio))
+        write_audio(name=file, audio=float_to_int16(data.audio))
 
         files = {'audio_file': (file, open(file, 'rb'), 'audio/wav')}
 
sonusai/utils/{wave.py → write_audio.py} RENAMED
@@ -2,8 +2,8 @@ from sonusai.mixture.constants import SAMPLE_RATE
 from sonusai.mixture.datatypes import AudioT
 
 
-def write_wav(name: str, audio: AudioT, sample_rate: int = SAMPLE_RATE) -> None:
-    """ Write a simple, uncompressed WAV file.
+def write_audio(name: str, audio: AudioT, sample_rate: int = SAMPLE_RATE) -> None:
+    """ Write an audio file.
 
     To write multiple channels, use a 2D array of shape [channels, samples].
     The bits per sample and PCM/float are determined by the data type.
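The rename is mechanical at the call sites shown earlier in this diff; for reference, a minimal usage sketch of the new name, assuming float audio converted to int16 the same way those call sites do (the buffer size and file name below are hypothetical; SAMPLE_RATE defaults come from sonusai.mixture.constants).

import numpy as np

from sonusai.utils import float_to_int16
from sonusai.utils import write_audio

# Hypothetical 16000-sample silent buffer (about one second at a 16 kHz sample rate).
audio = np.zeros(16000, dtype=np.float32)

write_audio(name='example.wav', audio=float_to_int16(audio))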
{sonusai-0.17.2.dist-info → sonusai-0.18.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonusai
-Version: 0.17.2
+Version: 0.18.0
 Summary: Framework for building deep neural network models for sound, speech, and voice AI
 Home-page: https://aaware.com
 License: GPL-3.0-only
@@ -21,12 +21,15 @@ Requires-Dist: h5py (>=3.11.0,<4.0.0)
 Requires-Dist: jiwer (>=3.0.3,<4.0.0)
 Requires-Dist: librosa (>=0.10.1,<0.11.0)
 Requires-Dist: matplotlib (>=3.8.0,<4.0.0)
+Requires-Dist: mgzip (>=0.2.1,<0.3.0)
+Requires-Dist: numpy (>=1.26.4,<2.0.0)
 Requires-Dist: onnx (>=1.14.1,<2.0.0)
 Requires-Dist: onnxruntime (>=1.16.1,<2.0.0)
 Requires-Dist: paho-mqtt (>=2.0.0,<3.0.0)
 Requires-Dist: pandas (>=2.1.1,<3.0.0)
 Requires-Dist: pesq (>=0.0.4,<0.0.5)
 Requires-Dist: praatio (>=6.2.0,<7.0.0)
+Requires-Dist: psutil (>=5,<6)
 Requires-Dist: pyaaware (>=1.5.7,<2.0.0)
 Requires-Dist: pyaudio (>=0.2.14,<0.3.0)
 Requires-Dist: pydub (>=0.25.1,<0.26.0)
{sonusai-0.17.2.dist-info → sonusai-0.18.0.dist-info}/RECORD CHANGED
@@ -1,17 +1,18 @@
-sonusai/__init__.py,sha256=vzTFfRB-NeO-Sm3puySDJOybk3ND_Oj6w0EejQPmH1U,2978
+sonusai/__init__.py,sha256=j2eH_QUsIIMm0HDiNHC5HCocWsX_GhtvlmTkT7zyYOw,2918
 sonusai/aawscd_probwrite.py,sha256=GukR5owp_0A3DrqSl9fHWULYgclNft4D5OkHIwfxxkc,3698
-sonusai/audiofe.py,sha256=AHXV7fQKumkwUSbOS-ZU6Cp1VF88DRtqt7foVbf-Nh8,11148
-sonusai/calc_metric_spenh.py,sha256=Xgy9EKbZRPAydjTZbpZjaqLBNkjQPjDmSbfL8PbVSgY,62157
+sonusai/audiofe.py,sha256=zOySiYs5ZZm60eMbA7RjhG6C0Ouhaii3WfL1d0Q8rxg,11154
+sonusai/calc_metric_spenh.py,sha256=SunJD8wkdUxyL0rRZt2auauZBEUzpi0IRY8MtXKh3wo,63645
 sonusai/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonusai/data/genmixdb.yml,sha256=-XSs_hUR6wHJVoTPmSewzXL7u61X-xmHY46lNPatxSE,1025
 sonusai/data/speech_ma01_01.wav,sha256=PK0vMKg-NR6rPE3KouxHGF6PKXnJCr7AwjMqfu98LUA,76644
 sonusai/data/whitenoise.wav,sha256=I2umov0m34y56F9IsIBi1XtE76ZeZaSKDf70cJRe3pI,1920044
 sonusai/doc/__init__.py,sha256=rP5Hgn0Iys_xkuv4caxngdqehuU4zLZsiKuv8Nde67M,19
-sonusai/doc/doc.py,sha256=4NEZ2K-hTk7Y1Gxx09UEjNhiYkD9xid-kJ1Nt8H5_gM,22670
+sonusai/doc/doc.py,sha256=LOf4HiIxMeQ7-n0ExDSldo1Tt1036SVaN_auwlqXUxQ,21489
 sonusai/doc.py,sha256=l8CaFgLI8mqx4tn0aXfxKqa2dy9GgC0zjYxZAkpmi1E,878
 sonusai/genft.py,sha256=OzET3iTE-QhrUckzidfZvCDXZlAxIF5Xe5NEf856Vvk,5662
+sonusai/genmetrics.py,sha256=fC8KPIB6wtBj_hs1X65lq3dqNTcWeuNs1eT7yXdpxD8,1830
 sonusai/genmix.py,sha256=TU5aTebGHsbfwsRbynYbegGBelSma9khuQkDk0dFE3I,7075
-sonusai/genmixdb.py,sha256=M67Y_SEysgHfTmHHOdOjxdpuryTMDNgbDteCzR1uLk8,19669
+sonusai/genmixdb.py,sha256=gF2qas1tH9MHEFLoEMrN3kYVm-vhAKaOuZ8ev-w4MQM,19553
 sonusai/gentcst.py,sha256=W1ZO3xs7CoZkFcvOTH-FLJOIA4I7Wzb0HVRC3hGGSaM,20223
 sonusai/lsdb.py,sha256=fMRqPlAu4B-4MsTXX-NaWXYyJ_dAOJlS-LrvQPQQsXg,6028
 sonusai/main.py,sha256=GC-pQrSqx9tWwIcmEo6V9SraEv5KskBLS_W_wz-f2ZM,2509
@@ -28,28 +29,27 @@ sonusai/metrics/class_summary.py,sha256=4Mb25nuk6eqotnQSFMuOQL3zofGcpNXDfDlPa513
 sonusai/metrics/confusion_matrix_summary.py,sha256=3qg6TMKjJeHtNjj2YnNjPFSlMrQXt0Zcu1dLkGB_aPU,4001
 sonusai/metrics/one_hot.py,sha256=QSeH_GdqBpOAKLrNnQ8gjcPC-vSdUqC0yPEQueTA6VI,13548
 sonusai/metrics/snr_summary.py,sha256=P4U5_Xr7v9F8kF-rZBnpsVNt3p42rIVS6zmch8yfVfg,5575
-sonusai/mixture/__init__.py,sha256=yszEbRnlxeZXSegEBUVwyrSZwNIl6ufaJu_NiZ-1rqY,5399
+sonusai/mixture/__init__.py,sha256=rFaHyroCf0Fd-SuxmH4dl8xZVjOe8gFjndouv9RtzIE,5347
 sonusai/mixture/audio.py,sha256=2lqy0DtTMTYhX4aAOIvVtLNn6QB5ivTa7cJIaAlbfAg,2385
 sonusai/mixture/augmentation.py,sha256=Blb90tdTwBOj5w9tRcYyS5H67YJuFiXsGqwZWd7ON4g,10468
 sonusai/mixture/class_count.py,sha256=_wFnVl2yEOnbor7pLg7cYOUeX6nioov-03Cv3SEbh2k,996
 sonusai/mixture/config.py,sha256=d2IzZ1samHWGMpoKzSmUwMyAWWhgmyNoxyO8oiUwbsg,22193
-sonusai/mixture/constants.py,sha256=xjCskcQi6khqYZDf7j6z1OkeN1C6wE06kBBapcJiNI4,1428
-sonusai/mixture/datatypes.py,sha256=mMNxtzyDvAmtuoTHVVJP7jBi6OH-QyC1NfC_ZIiuLlY,8440
+sonusai/mixture/constants.py,sha256=ZRM7Z8e6EwnL9RwaMVxks-QokN9KMWxnQzAf9VNxa9M,1408
+sonusai/mixture/datatypes.py,sha256=uVJtT2sVGS531pSglhaLLF5hZcI3_0oKQOWmMkrCwqo,8253
 sonusai/mixture/eq_rule_is_valid.py,sha256=MpQwRA5M76wSiQWEI1lW2cLFdPaMttBLcQp3tWD8efM,1243
 sonusai/mixture/feature.py,sha256=Rwuf82IoXzhHPGbKYVGcatImF_ssBf_FfvbqghVPXtg,4116
-sonusai/mixture/generation.py,sha256=miUrc3QOSUNIG6mDkiMCZ6M2ulivUZxlYUAJUOVomWc,39039
-sonusai/mixture/helpers.py,sha256=GSGSD2KnvOeEIB6IwNTxyaQNjghTSBMB729kUEd_RiM,22403
+sonusai/mixture/generation.py,sha256=H_9kgresvNAEI6pmqAEwOaxQMPg6geVw6G0Y8fmbu_o,42580
+sonusai/mixture/helpers.py,sha256=4gb_t65Aw9O8hUgDzq-7D_t6LynfQDl1jcuFhQIAdWI,24590
 sonusai/mixture/log_duration_and_sizes.py,sha256=baTUpqyM15wA125jo9E3posmVJUe3WlpksyO6v9Jul0,1347
-sonusai/mixture/mapped_snr_f.py,sha256=mlbYM1t14OXe_Zg4CjpWTuA_Zun4W0O3bSUXeodRBQs,1845
-sonusai/mixture/mixdb.py,sha256=PvLeEOLn2n0EfBRe7GuvUQfOmj3SKOrzjUimw2qRHP8,49792
+sonusai/mixture/mapped_snr_f.py,sha256=Fdf2uw62FvyKvVy5VywaUtPZGO1zCWQsHlte0bwkKPQ,3121
+sonusai/mixture/mixdb.py,sha256=xgHD43OgXaFKb1o4nsCPt-bd_mnk-PPetX4OGCGZ8DM,51582
 sonusai/mixture/soundfile_audio.py,sha256=mHa5SIXsu_uE0j3DO52GydRJrvWSzU_nII-7YJfQ6Qo,4154
 sonusai/mixture/sox_audio.py,sha256=HT3kYA9TP5QPCuoOJdUMnGVN-qY6q96DGL8zxuog76o,12277
-sonusai/mixture/sox_augmentation.py,sha256=F9tBdNvX2guCn7gRppAFrxRnBtjw9q6qAq2_v_A4hh0,4490
-sonusai/mixture/speaker_metadata.py,sha256=l98avdxLYUsSDZ88xUjfvHnACkbnD0_Dg1aBGDbzS9I,1380
+sonusai/mixture/sox_augmentation.py,sha256=kBWPrsFk0EBi71nLcKt5v0GA34bY7g9D9x0cEamNWbU,4564
 sonusai/mixture/spectral_mask.py,sha256=8AkCwhy-PSdP1Uri9miKZP-bXFYnFcH_c9xZCGrHavU,2071
 sonusai/mixture/target_class_balancing.py,sha256=NTNiKZH0_PWLooeow0l41CjJKK8ZTMVbUqz9ZkaNtWk,4900
 sonusai/mixture/targets.py,sha256=wyy5vhLhuN-hqBMBGoziVvEJg3FKFvJFgmEE7_LaV2M,7908
-sonusai/mixture/tokenized_shell_vars.py,sha256=gCxw8SQUcal6mqWKF7hOBTgSQmbJUk1nT0Gn3H8GA0U,4705
+sonusai/mixture/tokenized_shell_vars.py,sha256=zIAFvwP2WSvkMAGY7f3SJ4KLXI6IBT-U_e9ptnoo5Hc,4803
 sonusai/mixture/torchaudio_audio.py,sha256=KhHeOMsjmbwOaAcoKD61aFvYBYSlA8OysfT5iGn45MA,3010
 sonusai/mixture/torchaudio_augmentation.py,sha256=1vEDHI0caL1vrgoY2lAWe4CiHE2jKRuKKH7x23GHw0w,4390
 sonusai/mixture/truth.py,sha256=Y41pZ52Xkols9LUler0NlgnilUOscBIucmw4GcxXNzU,1612
@@ -61,34 +61,28 @@ sonusai/mixture/truth_functions/file.py,sha256=jOJuC_3y9BH6GGOp9eKcbVrHLVRzUA80B
 sonusai/mixture/truth_functions/phoneme.py,sha256=stYdlPuNytQK_LLT61OJLfYSqKd-sDjQZdtJKGzt5wA,479
 sonusai/mixture/truth_functions/sed.py,sha256=8cHjEFjZaH_0hIOHhPmj4AJz2GpEADM6Ys2x4NoiWSY,2469
 sonusai/mixture/truth_functions/target.py,sha256=KAsjugDRooOA5BRcHVAbZRgV7l8S5CFg7CZ0XtKZaQ0,5764
-sonusai/mkmanifest.py,sha256=imI8swwPYVzumrUYEL-9JLvun-ez98PtlUBj2b729k8,8682
-sonusai/mkwav.py,sha256=kLfC2ZuF-t8P97nqYw2falTZpymxAeXv0YTJCe6nK10,5356
-sonusai/onnx_predict.py,sha256=ZhicNEbjxm34edIrUcmuvKkV3NRFQk4LBn1LUCFdPjg,8733
+sonusai/mkwav.py,sha256=zfSyIiQTIK3KV9Ij33jkLhhZIMVYqaROcRQ4S7c4sIo,5364
+sonusai/onnx_predict.py,sha256=jSxhD2oFyGSTHOGCXbW4fRT-k4SqKOboK2JaDO-yWcs,8737
 sonusai/plot.py,sha256=ERkmxMM3qjcCDm4LGDQY4fRAncCYAzP7uW8iZ7_brcg,17105
-sonusai/post_spenh_targetf.py,sha256=xOz5T6WZuyTHmfbtILIY9skgH064Wvi2GF2Bo5L3YMU,4998
+sonusai/post_spenh_targetf.py,sha256=pHaJZtms7aj4r6sgqQnEGVi6Gg8H_V29szigogV1vZ8,5002
 sonusai/queries/__init__.py,sha256=oKY5JeqZ4Cz7DwCwPc1_ydB8bUs6KaMcWFp_w02TjOs,255
 sonusai/queries/queries.py,sha256=oV-m9uiLZOwYTK-Wo7Gf8dpGisaoGf6uDsAJAarVqZI,7553
 sonusai/speech/__init__.py,sha256=SuPcU_K9wQISsZRIzsRNLtEC6cb616l-Jlx3PU-HWMs,113
 sonusai/speech/l2arctic.py,sha256=28TT3CohvPu98YNUb8O7rWHAYgPGwYTOLSdfNQjOuyc,3736
 sonusai/speech/librispeech.py,sha256=A0IpamojCPXyJiHcjCtI7yNWdMjB00cbggjHslssrg8,3120
 sonusai/speech/mcgill.py,sha256=jcddj64fLdV3sO6CJNafm3w-2SnYoyQtU90odXhdaaE,1976
-sonusai/speech/textgrid.py,sha256=8hB6SdEEXxo6JXVFq8mJ1-ilRbBiRXhaHTQjA-HWg-0,3385
+sonusai/speech/textgrid.py,sha256=MVgpZhoL_ZXffqzqgC4N_EVmVpYP3FaOaNP9iTDUsUU,2722
 sonusai/speech/timit.py,sha256=1vWgj6isD3ATOjMJSTjOPLmDkYyB65M5MwYipEmLEvg,4081
 sonusai/speech/types.py,sha256=4eKVPAktpkIrZ2qoVp2iT45zxTVNocQEGT6O_Zlub_w,214
 sonusai/speech/vctk.py,sha256=EAMEBAzjZUI6dw15n-yI2oCN-H4tzM9t4aUVlOxpAbo,1540
-sonusai/speech/voxceleb2.py,sha256=-u0mtxFm4chFipLgMGZXR5EBDtYTCQoU1_j_wYTGwPY,2158
+sonusai/speech/voxceleb.py,sha256=aJGN0yDb2LFLmCKmRzmUEjpZWQ-QGWw6XWOpy9967AI,2686
 sonusai/summarize_metric_spenh.py,sha256=OiZe_bhCq5esXNhsOkHDD7g4ssYrpENDHvDVoPzV9iw,1822
 sonusai/tplot.py,sha256=85T6OPZfxVegHBiSuilFpdgCNMEE0VKAuciNy4rCY5Y,14544
-sonusai/utils/__init__.py,sha256=y2Xe72QMNk8LbbjdOUOHiR5eVg32fYrFhinWSuSHi-w,2248
+sonusai/utils/__init__.py,sha256=h7QrOyEBMUMoIBFKZpNwDG8Jg-1uw3bs-qflB3CXxhU,2257
 sonusai/utils/asl_p56.py,sha256=-bvQpd-jRQVURbkZJpRoyEAq6gTv9Rc3oFDbh5_lcjY,3861
 sonusai/utils/asr.py,sha256=6y6VYJizHpuQ3MgKbEQ4t2gofO-MW6Ez23oAd6d23IE,2920
 sonusai/utils/asr_functions/__init__.py,sha256=JyHK67s97bw7QzrlkboWhws4yNytdPatqzLJxfwx-yw,43
-sonusai/utils/asr_functions/aaware_whisper.py,sha256=LzO9CZV0wBWkjmCR2nSWN_AW9UJwriAsC1OYSlfVeT8,1981
-sonusai/utils/asr_manifest_functions/__init__.py,sha256=jfi9xC5c86F_aMSsI5Xj-pxWGxuQ7fwZ8Wdf4T7kDsA,343
-sonusai/utils/asr_manifest_functions/data.py,sha256=nO4oT3EQmydwn1pzc-ZM09yz4X2ic-LQuHzGEnJhKe8,32
-sonusai/utils/asr_manifest_functions/librispeech.py,sha256=_3tGc8qfAUpYJZ0_avpW0vGp7zjdpeqj1HAgXi3TL4Q,1612
-sonusai/utils/asr_manifest_functions/mcgill_speech.py,sha256=dW-5XTC5xOY3PHU2DvlWNWDeoprXDD0Zq2dXDdPAjzE,934
-sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py,sha256=9iMrnE-qabLMnyewyxsBMl0uCS8yS7BPJOdmUoOnGAc,2146
+sonusai/utils/asr_functions/aaware_whisper.py,sha256=Ew3zb8OfbxEW7q1s-KA7D5eph4SjVSUAJgiLK-vVqhI,1985
 sonusai/utils/audio_devices.py,sha256=LgaXTln1oRArBzaet3rZiIO2plgtaThuGBc3sJ_sLlo,1414
 sonusai/utils/braced_glob.py,sha256=Z_XIpPK17QiP1JbzAnUC5w3oyG8ZovoyM22Wh-Q_vWU,1675
 sonusai/utils/calculate_input_shape.py,sha256=63ILxibYKuTQozY83QN8Y2OOhBEbW_1X47Q0askcHDM,984
@@ -118,10 +112,10 @@ sonusai/utils/reshape.py,sha256=E8Eu6grynaeWwVO6peIR0BF22SrVaJSa1Rkl109lq6Y,5997
 sonusai/utils/seconds_to_hms.py,sha256=oxLuZhTJJr9swj-fOSOrZJ5vBNM7_BrOMQhX1pYpiv0,260
 sonusai/utils/stacked_complex.py,sha256=feLhz3GC1ILxBGMHOj3sJK--sidsXKbfwkalwAVwizc,2950
 sonusai/utils/stratified_shuffle_split.py,sha256=rJNXvBp-GxoKzH3OpL7k0ANSu5xMP2zJ7K1fm_33UzE,7022
-sonusai/utils/wave.py,sha256=O4ZXkZ6wjrKGa99wBCdFd8G6bp91MXXDnmGihpaEMh0,856
+sonusai/utils/write_audio.py,sha256=ZsPGExwM86QHLLN2LOWekK2uAqf5pV_1oRW811p0QAI,840
 sonusai/utils/yes_or_no.py,sha256=eMLXBVH0cEahiXY4W2KNORmwNQ-ba10eRtldh0y4NYg,263
 sonusai/vars.py,sha256=m2AefF0m5bXWGXpJj8Pi42zWL2ydeEj7bkak3GrtMyM,940
-sonusai-0.17.2.dist-info/METADATA,sha256=eZmrmMohaVLBAz3v2lGdBcwGCjnszgDiKcAHI9i_2YE,2483
-sonusai-0.17.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sonusai-0.17.2.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
-sonusai-0.17.2.dist-info/RECORD,,
+sonusai-0.18.0.dist-info/METADATA,sha256=LwOGcp1V_87ef8oyjI8Kjwo5K8tbIKjpRfJ17dW0Dbc,2591
+sonusai-0.18.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sonusai-0.18.0.dist-info/entry_points.txt,sha256=zMNjEphEPO6B3cD1GNpit7z-yA9tUU5-j3W2v-UWstU,92
+sonusai-0.18.0.dist-info/RECORD,,
sonusai/mixture/speaker_metadata.py DELETED
@@ -1,35 +0,0 @@
-from functools import cached_property
-from pathlib import Path
-
-from praatio import textgrid
-from praatio.data_classes.textgrid_tier import TextgridTier
-from praatio.utilities.constants import Interval
-
-from sonusai.mixture.datatypes import TargetFiles
-from sonusai.mixture.tokenized_shell_vars import tokenized_expand
-
-
-class SpeakerMetadata:
-    def __init__(self, target_files: TargetFiles) -> None:
-        self.data: dict[str, dict[str, TextgridTier]] = {}
-        for file in target_files:
-            self.data[file.name] = {}
-            file_name, _ = tokenized_expand(file.name)
-            tg_file = Path(file_name).with_suffix('.TextGrid')
-            if tg_file.exists():
-                tg = textgrid.openTextgrid(str(tg_file), includeEmptyIntervals=False)
-                for tier in tg.tierNames:
-                    self.data[file.name][tier] = tg.getTier(tier)
-
-    @cached_property
-    def tiers(self) -> list[str]:
-        return sorted(list(set([key for value in self.data.values() for key in value.keys()])))
-
-    def all(self, tier: str, label_only: bool = False) -> list[Interval]:
-        results = [value[tier].entries for value in self.data.values()]
-        if label_only:
-            return sorted(set([r.label for result in results for r in result]))
-        return results
-
-    def mixids_for(self, tier: str, value: str) -> list[int]:
-        pass
sonusai/mkmanifest.py DELETED
@@ -1,209 +0,0 @@
-"""mkmanifest
-
-usage: mkmanifest [-hvn] [--include GLOB] [-m METHOD] [-e ADAT] [-o OUTPUT] PATH ...
-
-options:
-    -h, --help
-    -v, --verbose                   Be verbose: list all files found.
-    -n, --dry-run                   Collect files, but exit without processing and writing manifest file.
-    --include GLOB                  Search only files whose base name matches GLOB. [default: *.{wav,flac}].
-    -m METHOD, --method METHOD      Method for getting the true speech text of the audio files. [default: librispeech].
-    -e ADAT, --audio-env ADAT       Environment variable pointing to all audio data.
-    -o OUTPUT, --output OUTPUT      Output file name. [default: asr_manifest.json].
-
-Make a speech recognition (ASR) .json manifest file of all audio files under PATHS following the NVIDIA NeMo format.
-An example of manifest entries:
-
-    {"audio_filepath": "<absolute_path_to>/1355-39947-0000.wav", "duration": 11.3, "text": "psychotherapy ..."}
-    {"audio_filepath": "<absolute_path_to>/1355-39947-0001.wav", "duration": 15.905, "text": "it is an ..."}
-
-See the NVIDIA NeMo docs for more information:
-    https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/datasets.html
-
-Inputs:
-    PATH        A relative path name or list of paths containing audio files. Each will be
-                recursively searched for files matching the pattern GLOB.
-    GLOB        Match the pattern GLOB using wildcard matching.
-                Example: '*.{wav,flac}' matches all .wav and .flac files.
-    METHOD      The method to use for fetching the true speech of the audio files.
-                Supported methods:
-                    - 'librispeech'
-                    - 'vctk_noisy_speech' expects subdirs named like <name>_wav/ and <name>_txt/ with files in
-                      each using same basename, but with .wav and .txt respectively.
-                    - 'mcgill-speech' expects audio data in basename/speakerid/speakerid-promptid.wav and
-                      transcript data in Scripts/HarvardLists.dat
-    ADAT        Audio data environment variable. All found files will be expanded to their full, absolute path and
-                then parts of the path that match the specified environment variable value will be replaced with
-                the variable. This accommodates portability across platforms where the sound datasets may in
-                different locations.
-    OUTPUT      Name of output file. Default is asr_manifest.json.
-
-Outputs the following to the current directory:
-    <OUTPUT>
-    mkmanifest.log
-
-Example usage for LibriSpeech:
-    sonusai mkmanifest -mlibrispeech -eADAT -oasr_manifest.json --include='*.flac' train-clean-100
-    sonusai mkmanifest -m mcgill-speech -e ADAT -o asr_manifest_16k.json 16k-LP7/
-"""
-import signal
-
-
-def signal_handler(_sig, _frame):
-    import sys
-
-    from sonusai import logger
-
-    logger.info('Canceled due to keyboard interrupt')
-    sys.exit(1)
-
-
-signal.signal(signal.SIGINT, signal_handler)
-
-VALID_METHOD = ['librispeech', 'vctk_noisy_speech', 'mcgill-speech']
-
-
-def main() -> None:
-    from docopt import docopt
-
-    import sonusai
-    from sonusai.utils import trim_docstring
-
-    args = docopt(trim_docstring(__doc__), version=sonusai.__version__, options_first=True)
-
-    verbose = args['--verbose']
-    dry_run = args['--dry-run']
-    include = args['--include']
-    method = args['--method']
-    audio_env = args['--audio-env']
-    output = args['--output']
-    paths = args['PATH']
-
-    import json
-    from functools import partial
-    import time
-    from os import environ
-    from os.path import abspath
-    from os.path import join
-    from os.path import realpath
-
-    from tqdm import tqdm
-
-    from sonusai import SonusAIError
-    from sonusai import create_file_handler
-    from sonusai import initial_log_messages
-    from sonusai import logger
-    from sonusai import update_console_handler
-    from sonusai.utils import PathInfo
-    from sonusai.utils import braced_iglob
-    from sonusai.utils import pp_tqdm_imap
-    from sonusai.utils import seconds_to_hms
-    from sonusai.utils.asr_manifest_functions import collect_librispeech_transcripts
-    from sonusai.utils.asr_manifest_functions import collect_vctk_noisy_speech_transcripts
-    from sonusai.utils.asr_manifest_functions import get_librispeech_manifest_entry
-    from sonusai.utils.asr_manifest_functions import get_vctk_noisy_speech_manifest_entry
-    from sonusai.utils.asr_manifest_functions import get_mcgill_speech_manifest_entry
-
-    start_time = time.monotonic()
-
-    create_file_handler('mkmanifest.log')
-    update_console_handler(verbose)
-    initial_log_messages('mkmanifest')
-
-    if method not in VALID_METHOD:
-        raise SonusAIError(f'Unknown method: {method}')
-
-    audio_dir = None
-    if audio_env is not None:
-        audio_dir = realpath(environ[audio_env])
-        if audio_dir is None:
-            raise SonusAIError(f'Unknown environment variable: {audio_env}')
-
-    if audio_env:
-        for p in paths:
-            if not realpath(abspath(p)).startswith(audio_dir):
-                logger.warning(f'Specified directory, {p}, is not part of the provided audio environment: '
-                               f'${audio_env}={audio_dir}')
-
-    logger.info('')
-    logger.info(f'Searching {len(paths)} provided director{"ies" if len(paths) > 1 else "y"}...')
-
-    entries: list[PathInfo] = []
-    for p in paths:
-        location = join(realpath(abspath(p)), '**', include)
-        logger.debug(f'Processing {location}')
-        for file in braced_iglob(pathname=location, recursive=True):
-            name = file
-            if audio_env is not None:
-                name = name.replace(audio_dir, f'${audio_env}')
-            entries.append(PathInfo(abs_path=file, audio_filepath=name))
-        logger.debug('')
-
-    logger.info(f'Found {len(entries)} audio file{"s" if len(entries) != 1 else ""}')
-
-    if dry_run:
-        logger.info('')
-        logger.info('Dry run')
-        logger.info('')
-        for entry in entries:
-            logger.info(f' - {entry.audio_filepath}')
-        return
-
-    if method == 'librispeech':
-        logger.info('Collecting LibriSpeech transcript data')
-        transcript_data = collect_librispeech_transcripts(paths=paths)
-
-        processing_func = partial(get_librispeech_manifest_entry, transcript_data=transcript_data)
-        progress = tqdm(total=len(entries), desc='Creating LibriSpeech manifest data')
-        results = pp_tqdm_imap(processing_func, entries, progress=progress)
-        progress.close()
-
-        with open(output, 'w') as f:
-            for result in results:
-                f.write(json.dumps(result) + '\n')
-
-    if method == 'vctk_noisy_speech':
-        logger.info('Collecting VCTK Noisy Speech transcript data')
-        transcript_data = collect_vctk_noisy_speech_transcripts(paths=paths)
-
-        processing_func = partial(get_vctk_noisy_speech_manifest_entry, transcript_data=transcript_data)
-        progress = tqdm(total=len(entries), desc='Creating VCTK Noisy Speech manifest data')
-        results = pp_tqdm_imap(processing_func, entries, progress=progress)
-        progress.close()
-
-        with open(output, 'w') as f:
-            for result in results:
-                f.write(json.dumps(result) + '\n')
-
-    if method == 'mcgill-speech':
-        logger.info(f'Found {len(entries)} Mcgill Speech files, opening prompt file ...')
-        # Note expecting only one path pointing to data subdir
-        if len(paths) != 1:
-            raise SonusAIError(f'mcgill-speech only support a single path')
-        prompt_fpath = join(join(realpath(abspath(paths[0]))), '../Scripts/HarvardList.dat')
-        with open(prompt_fpath, encoding='utf-8') as f:
-            lines = f.readlines()
-
-        logger.info(f'Found {len(lines) - 4} entries in prompt file.')
-        # First 4 lines are header stuff, can use remaining directly with simple lookup
-        # example line: '01_02:Glue the sheet ...\n' (paragraph 1, sentence 2)
-        # 11 entries per group, so getting line is 11*(p1-1)+(s2-1)
-        lines = lines[4:]
-
-        processing_func = partial(get_mcgill_speech_manifest_entry, transcript_data=lines)
-        progress = tqdm(total=len(entries), desc='Creating Mcgill Speech manifest data')
-        results = pp_tqdm_imap(processing_func, entries, progress=progress)
-        progress.close()
-
-        with open(output, 'w') as f:
-            for result in results:
-                f.write(json.dumps(result) + '\n')
-
-    end_time = time.monotonic()
-    logger.info('')
-    logger.info(f'Completed in {seconds_to_hms(seconds=end_time - start_time)}')
-    logger.info('')
-
-
-if __name__ == '__main__':
-    main()
sonusai/utils/asr_manifest_functions/__init__.py DELETED
@@ -1,6 +0,0 @@
-from .data import TranscriptData
-from .librispeech import collect_librispeech_transcripts
-from .librispeech import get_librispeech_manifest_entry
-from .vctk_noisy_speech import collect_vctk_noisy_speech_transcripts
-from .vctk_noisy_speech import get_vctk_noisy_speech_manifest_entry
-from .mcgill_speech import get_mcgill_speech_manifest_entry
sonusai/utils/asr_manifest_functions/data.py DELETED
@@ -1 +0,0 @@
-TranscriptData = dict[str, str]
sonusai/utils/asr_manifest_functions/librispeech.py DELETED
@@ -1,46 +0,0 @@
-from sonusai.utils import PathInfo
-from sonusai.utils.asr_manifest_functions import TranscriptData
-
-
-def collect_librispeech_transcripts(paths: list[str] | str) -> TranscriptData:
-    from glob import iglob
-    from os.path import abspath
-    from os.path import dirname
-    from os.path import join
-
-    from sonusai import SonusAIError
-
-    entries: TranscriptData = {}
-    if not isinstance(paths, list):
-        paths = [paths]
-
-    for p in paths:
-        location = join(abspath(p), '**', '*.trans.txt')
-        for file in iglob(pathname=location, recursive=True):
-            root = dirname(file)
-            with open(file, encoding='utf-8') as f:
-                for line in f:
-                    name, text = line[: line.index(' ')], line[line.index(' ') + 1:]
-                    name = join(root, name)
-                    if name in entries:
-                        raise SonusAIError(f'{name} already exists in transcript data')
-                    entries[name] = text.lower().strip()
-    return entries
-
-
-def get_librispeech_manifest_entry(entry: PathInfo, transcript_data: TranscriptData) -> dict:
-    from os.path import splitext
-    from subprocess import check_output
-
-    from sonusai import SonusAIError
-
-    name = splitext(entry.abs_path)[0]
-    duration = float(check_output(f'soxi -D {entry.abs_path}', shell=True))
-    if name not in transcript_data.keys():
-        raise SonusAIError(f'Could not find {name} in transcript data')
-
-    return {
-        'audio_filepath': entry.audio_filepath,
-        'text': transcript_data[name],
-        'duration': duration,
-    }
sonusai/utils/asr_manifest_functions/mcgill_speech.py DELETED
@@ -1,29 +0,0 @@
-from sonusai.utils import PathInfo
-
-
-def get_mcgill_speech_manifest_entry(entry: PathInfo, transcript_data: list[str]) -> dict:
-    from os.path import splitext
-    from os.path import basename
-    from subprocess import check_output
-
-    from sonusai import SonusAIError
-
-    name = splitext(entry.abs_path)[0]
-    duration = float(check_output(f'soxi -D {entry.abs_path}', shell=True))
-    # i.e., from MA01_02.wav, get 01_02
-    promptname = basename(name)[2:]
-    # paragraph num
-    pnum = int(promptname[0:2])
-    snum = int(promptname[3:5])
-    idx = 11 * (pnum - 1) + (snum - 1)
-    try:
-        # remove prompt-id prefix and \n suffix
-        text = transcript_data[idx][6:-1]
-    except IndexError:
-        raise SonusAIError(f'Could not find {promptname}, idx {idx} in transcript data')
-
-    return {
-        'audio_filepath': entry.audio_filepath,
-        'text': text,
-        'duration': duration,
-    }