sonusai 0.17.0__py3-none-any.whl → 0.17.3__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
- sonusai/audiofe.py +25 -54
- sonusai/calc_metric_spenh.py +212 -219
- sonusai/doc/doc.py +1 -1
- sonusai/mixture/__init__.py +2 -0
- sonusai/mixture/audio.py +12 -0
- sonusai/mixture/datatypes.py +11 -3
- sonusai/mixture/mixdb.py +100 -0
- sonusai/mixture/soundfile_audio.py +39 -0
- sonusai/mixture/sox_augmentation.py +3 -0
- sonusai/mixture/speaker_metadata.py +35 -0
- sonusai/mixture/torchaudio_audio.py +22 -0
- sonusai/mkmanifest.py +1 -1
- sonusai/mkwav.py +4 -4
- sonusai/onnx_predict.py +114 -410
- sonusai/post_spenh_targetf.py +2 -2
- sonusai/queries/queries.py +1 -1
- sonusai/speech/__init__.py +3 -0
- sonusai/speech/l2arctic.py +116 -0
- sonusai/speech/librispeech.py +99 -0
- sonusai/speech/mcgill.py +70 -0
- sonusai/speech/textgrid.py +100 -0
- sonusai/speech/timit.py +135 -0
- sonusai/speech/types.py +12 -0
- sonusai/speech/vctk.py +52 -0
- sonusai/speech/voxceleb.py +102 -0
- sonusai/utils/__init__.py +3 -2
- sonusai/utils/asr_functions/aaware_whisper.py +2 -2
- sonusai/utils/asr_manifest_functions/__init__.py +0 -1
- sonusai/utils/asr_manifest_functions/data.py +0 -8
- sonusai/utils/asr_manifest_functions/librispeech.py +1 -1
- sonusai/utils/asr_manifest_functions/mcgill_speech.py +1 -1
- sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +1 -1
- sonusai/utils/braced_glob.py +7 -3
- sonusai/utils/onnx_utils.py +110 -106
- sonusai/utils/path_info.py +7 -0
- sonusai/utils/{wave.py → write_audio.py} +2 -2
- {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/METADATA +3 -1
- {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/RECORD +40 -35
- {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/WHEEL +1 -1
- sonusai/calc_metric_spenh-save.py +0 -1334
- sonusai/onnx_predict-old.py +0 -240
- sonusai/onnx_predict-save.py +0 -487
- sonusai/ovino_predict.py +0 -508
- sonusai/ovino_query_devices.py +0 -47
- sonusai/torchl_onnx-old.py +0 -216
- {sonusai-0.17.0.dist-info → sonusai-0.17.3.dist-info}/entry_points.txt +0 -0
sonusai/speech/l2arctic.py
ADDED
@@ -0,0 +1,116 @@
import os
import string
from pathlib import Path
from typing import Optional

from .types import TimeAlignedType


def _get_duration(name: str) -> float:
    import soundfile

    from sonusai import SonusAIError

    try:
        return soundfile.info(name).duration
    except Exception as e:
        raise SonusAIError(f'Error reading {name}: {e}')


def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
    """Load time-aligned text data given an L2-ARCTIC audio file.

    :param audio: Path to the L2-ARCTIC audio file.
    :return: A TimeAlignedType object.
    """
    file = Path(audio).parent.parent / 'transcript' / (Path(audio).stem + '.txt')
    if not os.path.exists(file):
        return None

    with open(file, mode='r', encoding='utf-8') as f:
        line = f.read()

    return TimeAlignedType(0,
                           _get_duration(str(audio)),
                           line.strip().lower().translate(str.maketrans('', '', string.punctuation)))


def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
    """Load time-aligned word data given an L2-ARCTIC audio file.

    :param audio: Path to the L2-ARCTIC audio file.
    :return: A list of TimeAlignedType objects.
    """
    return _load_ta(audio, 'words')


def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
    """Load time-aligned phoneme data given an L2-ARCTIC audio file.

    :param audio: Path to the L2-ARCTIC audio file.
    :return: A list of TimeAlignedType objects.
    """
    return _load_ta(audio, 'phones')


def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
    from praatio import textgrid

    file = Path(audio).parent.parent / 'textgrid' / (Path(audio).stem + '.TextGrid')
    if not os.path.exists(file):
        return None

    tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
    if tier not in tg.tierNames:
        return None

    entries: list[TimeAlignedType] = []
    for entry in tg.getTier(tier).entries:
        entries.append(TimeAlignedType(text=entry.label, start=entry.start, end=entry.end))

    return entries


def load_annotations(audio: str | os.PathLike[str]) -> Optional[dict[str, list[TimeAlignedType]]]:
    """Load time-aligned annotation data given an L2-ARCTIC audio file.

    :param audio: Path to the L2-ARCTIC audio file.
    :return: A dictionary mapping tier names to lists of TimeAlignedType objects.
    """
    from praatio import textgrid

    file = Path(audio).parent.parent / 'annotation' / (Path(audio).stem + '.TextGrid')
    if not os.path.exists(file):
        return None

    tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
    result: dict[str, list[TimeAlignedType]] = {}
    for tier in tg.tierNames:
        entries: list[TimeAlignedType] = []
        for entry in tg.getTier(tier).entries:
            entries.append(TimeAlignedType(text=entry.label, start=entry.start, end=entry.end))
        result[tier] = entries

    return result


def load_speakers(input_dir: Path) -> dict:
    speakers = {}
    with open(input_dir / 'readme-download.txt') as file:
        processing = False
        for line in file:
            if not processing and line.startswith('|---|'):
                processing = True
                continue

            if processing:
                if line.startswith('|**Total**|'):
                    break
                else:
                    fields = line.strip().split('|')
                    speaker_id = fields[1]
                    gender = fields[2]
                    dialect = fields[3]
                    speakers[speaker_id] = {'gender': gender, 'dialect': dialect}

    return speakers
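Taken together, the l2arctic loaders give a caller the utterance transcript plus word- and phoneme-level alignments for a single recording. A minimal usage sketch (the dataset path below is hypothetical, and assumes the standard L2-ARCTIC layout with sibling wav/, transcript/, textgrid/, and annotation/ directories):

from sonusai.speech.l2arctic import load_phonemes, load_text, load_words

# Hypothetical location of a local L2-ARCTIC download
audio = '/data/l2arctic/ABA/wav/arctic_a0001.wav'

text = load_text(audio)          # whole-utterance transcript, or None if missing
words = load_words(audio)        # 'words' tier from the matching TextGrid
phonemes = load_phonemes(audio)  # 'phones' tier from the matching TextGrid

if text is not None:
    print(f'{text.start:.2f}-{text.end:.2f}s: {text.text}')
if words is not None:
    print(f'{len(words)} aligned words')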
sonusai/speech/librispeech.py
ADDED
@@ -0,0 +1,99 @@
import os
from pathlib import Path
from typing import Optional

from .types import TimeAlignedType


def _get_num_samples(audio: str | os.PathLike[str]) -> int:
    """Get number of samples from an audio file using soundfile.

    :param audio: Audio file name
    :return: Number of samples
    """
    import soundfile
    from pydub import AudioSegment

    if Path(audio).suffix == '.mp3':
        # frame_count() returns a float; cast to match the declared return type
        return int(AudioSegment.from_mp3(audio).frame_count())

    if Path(audio).suffix == '.m4a':
        return int(AudioSegment.from_file(audio).frame_count())

    return soundfile.info(audio).frames


def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
    """Load text data from a LibriSpeech transcription file given a LibriSpeech audio filename.

    :param audio: Path to the LibriSpeech audio file.
    :return: A TimeAlignedType object.
    """
    import string

    from sonusai.mixture import get_sample_rate

    path = Path(audio)
    name = path.stem
    transcript_filename = path.parent / f'{path.parent.parent.name}-{path.parent.name}.trans.txt'

    if not os.path.exists(transcript_filename):
        return None

    with open(transcript_filename, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            fields = line.strip().split()
            key = fields[0]
            if key == name:
                text = ' '.join(fields[1:]).lower().translate(str.maketrans('', '', string.punctuation))
                return TimeAlignedType(0, _get_num_samples(audio) / get_sample_rate(str(audio)), text)

    return None


def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
    """Load time-aligned word data given a LibriSpeech audio file.

    :param audio: Path to the LibriSpeech audio file.
    :return: A list of TimeAlignedType objects.
    """
    return _load_ta(audio, 'words')


def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
    """Load time-aligned phoneme data given a LibriSpeech audio file.

    :param audio: Path to the LibriSpeech audio file.
    :return: A list of TimeAlignedType objects.
    """
    return _load_ta(audio, 'phones')


def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
    from praatio import textgrid

    file = Path(audio).with_suffix('.TextGrid')
    if not os.path.exists(file):
        return None

    tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
    if tier not in tg.tierNames:
        return None

    entries: list[TimeAlignedType] = []
    for entry in tg.getTier(tier).entries:
        entries.append(TimeAlignedType(text=entry.label, start=entry.start, end=entry.end))

    return entries


def load_speakers(input_dir: Path) -> dict:
    speakers = {}
    with open(input_dir / 'SPEAKERS.TXT') as file:
        for line in file:
            if not line.startswith(';'):
                fields = line.strip().split('|')
                speaker_id = fields[0].strip()
                gender = fields[1].strip()
                speakers[speaker_id] = {'gender': gender}
    return speakers
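Unlike L2-ARCTIC, LibriSpeech keeps one transcription file per chapter (named {speaker}-{chapter}.trans.txt) keyed by utterance name, which is why load_text scans that file for a matching key instead of opening a per-utterance transcript. A usage sketch (the paths are hypothetical):

from pathlib import Path

from sonusai.speech.librispeech import load_speakers, load_text

# Hypothetical location of a local LibriSpeech download
audio = '/data/librispeech/train-clean-100/19/198/19-198-0001.flac'

ta = load_text(audio)
if ta is not None:
    print(f'{ta.end - ta.start:.2f}s: {ta.text}')

speakers = load_speakers(Path('/data/librispeech'))
print(speakers.get('19'))  # e.g. {'gender': 'F'}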
sonusai/speech/mcgill.py
ADDED
@@ -0,0 +1,70 @@
import os
from typing import Optional

from .types import TimeAlignedType


def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
    """Load time-aligned text data given a McGill-Speech audio file.

    :param audio: Path to the McGill-Speech audio file.
    :return: A TimeAlignedType object.
    """
    import string
    import struct

    from sonusai.mixture import get_sample_rate

    if not os.path.exists(audio):
        return None

    sample_rate = get_sample_rate(str(audio))

    with open(audio, mode='rb') as f:
        content = f.read()

    riff_id, file_size, wave_id = struct.unpack('<4si4s', content[:12])
    if riff_id.decode('utf-8') != 'RIFF':
        return None

    if wave_id.decode('utf-8') != 'WAVE':
        return None

    fmt_id, fmt_size = struct.unpack('<4si', content[12:20])

    if fmt_id.decode('utf-8') != 'fmt ':
        return None

    if fmt_size != 16:
        return None

    (_wave_format_tag,
     channels,
     _samples_per_sec,
     _avg_bytes_per_sec,
     _block_align,
     bits_per_sample) = struct.unpack('<hhiihh', content[20:36])

    i = 36
    samples = None
    text = None
    while i < file_size:
        chunk_id = struct.unpack('<4s', content[i:i + 4])[0].decode('utf-8')
        chunk_size = struct.unpack('<i', content[i + 4:i + 8])[0]

        if chunk_id == 'data':
            samples = chunk_size / channels / (bits_per_sample / 8)
            break

        if chunk_id == 'afsp':
            chunks = struct.unpack(f'<{chunk_size}s', content[i + 8:i + 8 + chunk_size])[0]
            chunks = chunks.decode('utf-8').split('\x00')
            for chunk in chunks:
                if chunk.startswith('text: "'):
                    text = chunk[7:-1].lower().translate(str.maketrans('', '', string.punctuation))

        i += 8 + chunk_size + chunk_size % 2

    if text and samples:
        return TimeAlignedType(start=0, end=samples / sample_rate, text=text)

    return None
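The McGill loader parses the WAV container by hand because the prompt text lives in a nonstandard 'afsp' chunk that typical audio libraries do not expose. The chunk walk itself is generic RIFF handling; a standalone sketch of the same pattern (no sonusai dependency, function name is illustrative):

import struct


def walk_riff_chunks(content: bytes) -> dict[str, int]:
    """Map chunk id -> chunk size for a RIFF/WAVE byte string."""
    riff_id, riff_size, wave_id = struct.unpack('<4si4s', content[:12])
    assert riff_id == b'RIFF' and wave_id == b'WAVE'

    chunks = {}
    i = 12  # first chunk starts right after the 12-byte RIFF header
    while i < riff_size:
        chunk_id = content[i:i + 4].decode('utf-8')
        chunk_size = struct.unpack('<i', content[i + 4:i + 8])[0]
        chunks[chunk_id] = chunk_size
        # Chunks are word-aligned: odd sizes carry one pad byte
        i += 8 + chunk_size + chunk_size % 2
    return chunks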
sonusai/speech/textgrid.py
ADDED
@@ -0,0 +1,100 @@
from pathlib import Path

from praatio import textgrid
from praatio.utilities.constants import Interval

from .types import TimeAlignedType


def _get_duration(name: str) -> float:
    from pydub import AudioSegment

    from sonusai import SonusAIError

    try:
        return AudioSegment.from_file(name).duration_seconds
    except Exception as e:
        raise SonusAIError(f'Error reading {name}: {e}')


def create_textgrid(prompt: Path,
                    speaker_id: str,
                    speaker: dict,
                    output_dir: Path,
                    text: TimeAlignedType = None,
                    words: list[TimeAlignedType] = None,
                    phonemes: list[TimeAlignedType] = None) -> None:
    if text is not None or words is not None or phonemes is not None:
        min_t, max_t = _get_min_max({'phonemes': phonemes,
                                     'text': [text],
                                     'words': words})
    else:
        min_t = 0
        max_t = _get_duration(str(prompt))

    tg = textgrid.Textgrid()

    tg.addTier(textgrid.IntervalTier('speaker_id', [Interval(min_t, max_t, speaker_id)], min_t, max_t))
    for tier in speaker.keys():
        tg.addTier(textgrid.IntervalTier(tier, [Interval(min_t, max_t, str(speaker[tier]))], min_t, max_t))

    if text is not None:
        entries = [Interval(text.start, text.end, text.text)]
        text_tier = textgrid.IntervalTier('text', entries, min_t, max_t)
        tg.addTier(text_tier)

    if words is not None:
        entries = []
        for word in words:
            entries.append(Interval(word.start, word.end, word.text))
        words_tier = textgrid.IntervalTier('words', entries, min_t, max_t)
        tg.addTier(words_tier)

    if phonemes is not None:
        entries = []
        for phoneme in phonemes:
            entries.append(Interval(phoneme.start, phoneme.end, phoneme.text))
        phonemes_tier = textgrid.IntervalTier('phonemes', entries, min_t, max_t)
        tg.addTier(phonemes_tier)

    output_filename = str(output_dir / prompt.stem) + '.TextGrid'
    tg.save(output_filename, format='long_textgrid', includeBlankSpaces=True)


def _get_min_max(tiers: dict[str, list[TimeAlignedType]]) -> tuple[float, float]:
    starts = []
    ends = []
    for tier in tiers.values():
        if tier is not None:
            starts.append(tier[0].start)
            ends.append(tier[-1].end)

    return min(starts), max(ends)


def annotate_textgrid(tiers: dict[str, list[TimeAlignedType]], prompt: Path, output_dir: Path) -> None:
    import os

    if tiers is None:
        return

    file = Path(output_dir / prompt.stem).with_suffix('.TextGrid')
    if not os.path.exists(file):
        tg = textgrid.Textgrid()
        min_t, max_t = _get_min_max(tiers)
    else:
        tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
        min_t = tg.minTimestamp
        max_t = tg.maxTimestamp

    for tier in tiers.keys():
        entries = []
        for entry in tiers[tier]:
            entries.append(Interval(entry.start, entry.end, entry.text))
        if tier == 'phones':
            name = 'annotation_phonemes'
        else:
            name = 'annotation_' + tier
        tg.addTier(textgrid.IntervalTier(name, entries, min_t, max_t))

    tg.save(str(file), format='long_textgrid', includeBlankSpaces=True)
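create_textgrid is the write-side counterpart of the dataset loaders: whatever alignments are available become Praat tiers in one TextGrid per prompt, alongside speaker-metadata tiers. A sketch of how the pieces might be wired together (paths and speaker metadata are hypothetical, and the companion .TXT and .WRD files are assumed to exist alongside the audio):

from pathlib import Path

from sonusai.speech.textgrid import create_textgrid
from sonusai.speech.timit import load_text, load_words

# Hypothetical prompt and output locations
prompt = Path('/data/timit/TRAIN/DR1/FCJF0/SA1.WAV')
output_dir = Path('/tmp/textgrids')
output_dir.mkdir(parents=True, exist_ok=True)

create_textgrid(prompt=prompt,
                speaker_id='FCJF0',
                speaker={'gender': 'F', 'dialect': 'New England'},
                output_dir=output_dir,
                text=load_text(prompt),
                words=load_words(prompt))
# Writes /tmp/textgrids/SA1.TextGrid with speaker_id, gender, dialect,
# text, and words tiers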
sonusai/speech/timit.py
ADDED
@@ -0,0 +1,135 @@
import os
from pathlib import Path
from typing import Optional

from .types import TimeAlignedType


def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
    """Load time-aligned text data given a TIMIT audio file.

    :param audio: Path to the TIMIT audio file.
    :return: A TimeAlignedType object.
    """
    import string

    from sonusai.mixture import get_sample_rate

    file = Path(audio).with_suffix('.TXT')
    if not os.path.exists(file):
        return None

    sample_rate = get_sample_rate(str(audio))

    with open(file, mode='r', encoding='utf-8') as f:
        line = f.read()

    fields = line.strip().split()
    start = int(fields[0]) / sample_rate
    end = int(fields[1]) / sample_rate
    text = ' '.join(fields[2:]).lower().translate(str.maketrans('', '', string.punctuation))

    return TimeAlignedType(start, end, text)


def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
    """Load time-aligned word data given a TIMIT audio file.

    :param audio: Path to the TIMIT audio file.
    :return: A list of TimeAlignedType objects.
    """
    return _load_ta(audio, 'words')


def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
    """Load time-aligned phoneme data given a TIMIT audio file.

    :param audio: Path to the TIMIT audio file.
    :return: A list of TimeAlignedType objects.
    """
    return _load_ta(audio, 'phonemes')


def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
    from sonusai.mixture import get_sample_rate

    if tier == 'words':
        file = Path(audio).with_suffix('.WRD')
    elif tier == 'phonemes':
        file = Path(audio).with_suffix('.PHN')
    else:
        raise ValueError(f'Unknown tier: {tier}')

    if not os.path.exists(file):
        return None

    sample_rate = get_sample_rate(str(audio))

    entries: list[TimeAlignedType] = []
    first = True
    with open(file, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            fields = line.strip().split()
            start = int(fields[0]) / sample_rate
            end = int(fields[1]) / sample_rate
            text = ' '.join(fields[2:])

            if first:
                first = False
            else:
                if start < entries[-1].end:
                    # Entries overlap: nudge the boundary and truncate the previous entry
                    start = entries[-1].end - (entries[-1].end - start) // 2
                    entries[-1] = TimeAlignedType(text=entries[-1].text, start=entries[-1].start, end=start)

            if end <= start:
                end = start + 1 / sample_rate

            entries.append(TimeAlignedType(text=text, start=start, end=end))

    return entries


def _years_between(record, born):
    try:
        rec_fields = [int(x) for x in record.split('/')]
        brn_fields = [int(x) for x in born.split('/')]
        return rec_fields[2] - brn_fields[2] - ((rec_fields[1], rec_fields[0]) < (brn_fields[1], brn_fields[0]))
    except ValueError:
        return '??'


def _decode_dialect(d: str) -> str:
    if d in ['DR1', '1']:
        return 'New England'
    if d in ['DR2', '2']:
        return 'Northern'
    if d in ['DR3', '3']:
        return 'North Midland'
    if d in ['DR4', '4']:
        return 'South Midland'
    if d in ['DR5', '5']:
        return 'Southern'
    if d in ['DR6', '6']:
        return 'New York City'
    if d in ['DR7', '7']:
        return 'Western'
    if d in ['DR8', '8']:
        return 'Army Brat'

    raise ValueError(f'Unrecognized dialect: {d}')


def load_speakers(input_dir: Path) -> dict:
    speakers = {}
    with open(input_dir / 'SPKRINFO.TXT') as file:
        for line in file:
            if not line.startswith(';'):
                fields = line.strip().split()
                speaker_id = fields[0]
                gender = fields[1]
                dialect = _decode_dialect(fields[2])
                age = _years_between(fields[4], fields[5])
                speakers[speaker_id] = {'gender': gender, 'dialect': dialect, 'age': age}
    return speakers
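TIMIT boundaries are given in samples, so each loader divides by the file's sample rate; _load_ta also repairs overlapping or zero-length intervals so every returned entry has positive duration. Usage sketch (the path is hypothetical):

from sonusai.speech.timit import load_phonemes, load_text

# Hypothetical location of a local TIMIT download
audio = '/data/timit/TRAIN/DR1/FCJF0/SA1.WAV'

text = load_text(audio)          # start/end taken from the companion .TXT file
phonemes = load_phonemes(audio)  # from the .PHN file, boundaries in seconds

if phonemes is not None:
    for p in phonemes[:5]:
        print(f'{p.start:.3f}-{p.end:.3f}  {p.text}')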
sonusai/speech/types.py
ADDED
sonusai/speech/vctk.py
ADDED
@@ -0,0 +1,52 @@
import os
from pathlib import Path
from typing import Optional

from .types import TimeAlignedType


def _get_duration(name: str) -> float:
    import soundfile

    from sonusai import SonusAIError

    try:
        return soundfile.info(name).duration
    except Exception as e:
        raise SonusAIError(f'Error reading {name}: {e}')


def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
    """Load time-aligned text data given a VCTK audio file.

    :param audio: Path to the VCTK audio file.
    :return: A TimeAlignedType object.
    """
    import string

    file = Path(audio).parents[2] / 'txt' / Path(audio).parent.name / (Path(audio).stem[:-5] + '.txt')
    if not os.path.exists(file):
        return None

    with open(file, mode='r', encoding='utf-8') as f:
        line = f.read()

    start = 0
    end = _get_duration(str(audio))
    text = line.strip().lower().translate(str.maketrans('', '', string.punctuation))

    return TimeAlignedType(start, end, text)


def load_speakers(input_dir: Path) -> dict:
    speakers = {}
    with open(input_dir / 'speaker-info.txt') as file:
        for line in file:
            if not line.startswith('ID'):
                fields = line.strip().split('(', 1)[0].split()
                speaker_id = fields[0]
                age = fields[1]
                gender = fields[2]
                dialect = ' '.join([field for field in fields[3:]])
                speakers[speaker_id] = {'gender': gender, 'dialect': dialect, 'age': age}
    return speakers
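The transcript lookup above assumes VCTK's released layout, where audio file names carry a _micN suffix that the paired prompt text does not; Path(audio).stem[:-5] strips that suffix. A quick illustration of the path mapping (the root directory is hypothetical):

from pathlib import Path

audio = Path('/data/vctk/wav48_silence_trimmed/p225/p225_001_mic1.flac')
txt = audio.parents[2] / 'txt' / audio.parent.name / (audio.stem[:-5] + '.txt')
print(txt)  # /data/vctk/txt/p225/p225_001.txt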
sonusai/speech/voxceleb.py
ADDED
@@ -0,0 +1,102 @@
import os
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Segment:
    person: str
    video: str
    id: str
    start: float
    stop: float


def load_speakers(input_dir: Path) -> dict:
    import csv

    speakers = {}

    # VoxCeleb1
    first = True
    with open(input_dir / 'vox1_meta.csv', newline='') as file:
        data = csv.reader(file, delimiter='\t')
        for row in data:
            if first:
                first = False
            else:
                speakers[row[0].strip()] = {'gender': row[2].strip(),
                                            'dialect': row[3].strip(),
                                            'category': row[4].strip()}

    # VoxCeleb2
    first = True
    with open(input_dir / 'vox2_meta.csv', newline='') as file:
        data = csv.reader(file, delimiter='\t')
        for row in data:
            if first:
                first = False
            else:
                speakers[row[1].strip()] = {'gender': row[3].strip(),
                                            'category': row[4].strip()}

    return speakers


def load_segment(path: str | os.PathLike[str]) -> Segment:
    path = Path(path)

    with path.open('r') as file:
        segment = file.read().strip()

    header, frames = segment.split('\n\n')
    header_dict = _parse_header(header)
    start, stop = _get_segment_boundaries(frames)

    return Segment(person=header_dict['Identity'],
                   video=header_dict['Reference'],
                   id=path.stem,
                   start=start,
                   stop=stop)


def _parse_header(header: str) -> dict:
    def _parse_line(line: str) -> tuple[str, str]:
        """Parse a line of header text into a key/value pair.

        Header text has the following format:

        Identity : \tid00017
        Reference : \t7t6lfzvVaTM
        Offset : \t1
        FV Conf : \t16.647\t(1)
        ASD Conf : \t4.465

        """
        k, v = line.split('\t', maxsplit=1)
        k = k[:-2].strip()
        v = v.strip()
        return k, v

    return dict(_parse_line(line) for line in header.split('\n'))


def _get_segment_boundaries(frames: str) -> tuple[float, float]:
    """Get the start and stop points of the segment.

    Frames text has the following format:

    FRAME X Y W H
    000245 0.392 0.223 0.253 0.451
    ...
    000470 0.359 0.207 0.260 0.463

    """

    def _get_frame_seconds(line: str) -> float:
        frame = int(line.split('\t')[0])
        # YouTube is 25 FPS
        return frame / 25

    lines = frames.split('\n')
    return _get_frame_seconds(lines[1]), _get_frame_seconds(lines[-1])