sonusai 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. sonusai/audiofe.py +22 -51
  2. sonusai/calc_metric_spenh.py +206 -213
  3. sonusai/doc/doc.py +1 -1
  4. sonusai/mixture/__init__.py +2 -0
  5. sonusai/mixture/audio.py +12 -0
  6. sonusai/mixture/datatypes.py +11 -3
  7. sonusai/mixture/mixdb.py +101 -0
  8. sonusai/mixture/soundfile_audio.py +39 -0
  9. sonusai/mixture/speaker_metadata.py +35 -0
  10. sonusai/mixture/torchaudio_audio.py +22 -0
  11. sonusai/mkmanifest.py +1 -1
  12. sonusai/onnx_predict.py +114 -410
  13. sonusai/queries/queries.py +1 -1
  14. sonusai/speech/__init__.py +3 -0
  15. sonusai/speech/l2arctic.py +116 -0
  16. sonusai/speech/librispeech.py +99 -0
  17. sonusai/speech/mcgill.py +70 -0
  18. sonusai/speech/textgrid.py +100 -0
  19. sonusai/speech/timit.py +135 -0
  20. sonusai/speech/types.py +12 -0
  21. sonusai/speech/vctk.py +52 -0
  22. sonusai/speech/voxceleb2.py +86 -0
  23. sonusai/utils/__init__.py +2 -1
  24. sonusai/utils/asr_manifest_functions/__init__.py +0 -1
  25. sonusai/utils/asr_manifest_functions/data.py +0 -8
  26. sonusai/utils/asr_manifest_functions/librispeech.py +1 -1
  27. sonusai/utils/asr_manifest_functions/mcgill_speech.py +1 -1
  28. sonusai/utils/asr_manifest_functions/vctk_noisy_speech.py +1 -1
  29. sonusai/utils/braced_glob.py +7 -3
  30. sonusai/utils/onnx_utils.py +110 -106
  31. sonusai/utils/path_info.py +7 -0
  32. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/METADATA +2 -1
  33. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/RECORD +35 -30
  34. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/WHEEL +1 -1
  35. sonusai/calc_metric_spenh-save.py +0 -1334
  36. sonusai/onnx_predict-old.py +0 -240
  37. sonusai/onnx_predict-save.py +0 -487
  38. sonusai/ovino_predict.py +0 -508
  39. sonusai/ovino_query_devices.py +0 -47
  40. sonusai/torchl_onnx-old.py +0 -216
  41. {sonusai-0.17.0.dist-info → sonusai-0.17.2.dist-info}/entry_points.txt +0 -0
sonusai/speech/l2arctic.py ADDED
@@ -0,0 +1,116 @@
+ import os
+ import string
+ from pathlib import Path
+ from typing import Optional
+
+ from .types import TimeAlignedType
+
+
+ def _get_duration(name: str) -> float:
+     import soundfile
+
+     from sonusai import SonusAIError
+
+     try:
+         return soundfile.info(name).duration
+     except Exception as e:
+         raise SonusAIError(f'Error reading {name}: {e}')
+
+
+ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+     """Load time-aligned text data given an L2-ARCTIC audio file.
+
+     :param audio: Path to the L2-ARCTIC audio file.
+     :return: A TimeAlignedType object.
+     """
+     file = Path(audio).parent.parent / 'transcript' / (Path(audio).stem + '.txt')
+     if not os.path.exists(file):
+         return None
+
+     with open(file, mode='r', encoding='utf-8') as f:
+         line = f.read()
+
+     return TimeAlignedType(0,
+                            _get_duration(str(audio)),
+                            line.strip().lower().translate(str.maketrans('', '', string.punctuation)))
+
+
+ def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+     """Load time-aligned word data given an L2-ARCTIC audio file.
+
+     :param audio: Path to the L2-ARCTIC audio file.
+     :return: A list of TimeAlignedType objects.
+     """
+     return _load_ta(audio, 'words')
+
+
+ def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+     """Load time-aligned phoneme data given an L2-ARCTIC audio file.
+
+     :param audio: Path to the L2-ARCTIC audio file.
+     :return: A list of TimeAlignedType objects.
+     """
+     return _load_ta(audio, 'phones')
+
+
+ def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
+     from praatio import textgrid
+
+     file = Path(audio).parent.parent / 'textgrid' / (Path(audio).stem + '.TextGrid')
+     if not os.path.exists(file):
+         return None
+
+     tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
+     if tier not in tg.tierNames:
+         return None
+
+     entries: list[TimeAlignedType] = []
+     for entry in tg.getTier(tier).entries:
+         entries.append(TimeAlignedType(text=entry.label, start=entry.start, end=entry.end))
+
+     return entries
+
+
+ def load_annotations(audio: str | os.PathLike[str]) -> Optional[dict[str, list[TimeAlignedType]]]:
+     """Load time-aligned annotation data given an L2-ARCTIC audio file.
+
+     :param audio: Path to the L2-ARCTIC audio file.
+     :return: A dictionary mapping tier names to lists of TimeAlignedType objects.
+     """
+     from praatio import textgrid
+
+     file = Path(audio).parent.parent / 'annotation' / (Path(audio).stem + '.TextGrid')
+     if not os.path.exists(file):
+         return None
+
+     tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
+     result: dict[str, list[TimeAlignedType]] = {}
+     for tier in tg.tierNames:
+         entries: list[TimeAlignedType] = []
+         for entry in tg.getTier(tier).entries:
+             entries.append(TimeAlignedType(text=entry.label, start=entry.start, end=entry.end))
+         result[tier] = entries
+
+     return result
+
+
+ def load_speakers(input_dir: Path) -> dict:
+     speakers = {}
+     with open(input_dir / 'readme-download.txt') as file:
+         processing = False
+         for line in file:
+             if not processing and line.startswith('|---|'):
+                 processing = True
+                 continue
+
+             if processing:
+                 if line.startswith('|**Total**|'):
+                     break
+                 else:
+                     fields = line.strip().split('|')
+                     speaker_id = fields[1]
+                     gender = fields[2]
+                     dialect = fields[3]
+                     speakers[speaker_id] = {'gender': gender, 'dialect': dialect}
+
+     return speakers
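All of the loaders in this new module resolve their sidecar files (transcript/, textgrid/, annotation/) relative to the audio path and return None when a file is missing. A minimal usage sketch, assuming a standard L2-ARCTIC layout (the corpus root below is hypothetical):

    from pathlib import Path

    from sonusai.speech import l2arctic

    # Hypothetical layout: <root>/<speaker>/wav/arctic_a0001.wav
    audio = Path('/data/l2arctic/ABA/wav/arctic_a0001.wav')

    text = l2arctic.load_text(audio)          # whole-utterance transcript, or None
    words = l2arctic.load_words(audio)        # 'words' tier from the TextGrid
    phonemes = l2arctic.load_phonemes(audio)  # 'phones' tier from the TextGrid
    if text is not None:
        print(f'{text.duration:.2f}s: {text.text}')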
sonusai/speech/librispeech.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ from pathlib import Path
+ from typing import Optional
+
+ from .types import TimeAlignedType
+
+
+ def _get_num_samples(audio: str | os.PathLike[str]) -> int:
+     """Get the number of samples in an audio file, using soundfile (or pydub for mp3/m4a).
+
+     :param audio: Audio file name
+     :return: Number of samples
+     """
+     import soundfile
+     from pydub import AudioSegment
+
+     if Path(audio).suffix == '.mp3':
+         return AudioSegment.from_mp3(audio).frame_count()
+
+     if Path(audio).suffix == '.m4a':
+         return AudioSegment.from_file(audio).frame_count()
+
+     return soundfile.info(audio).frames
+
+
+ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+     """Load text data from a LibriSpeech transcription file given a LibriSpeech audio filename.
+
+     :param audio: Path to the LibriSpeech audio file.
+     :return: A TimeAlignedType object.
+     """
+     import string
+
+     from sonusai.mixture import get_sample_rate
+
+     path = Path(audio)
+     name = path.stem
+     transcript_filename = path.parent / f'{path.parent.parent.name}-{path.parent.name}.trans.txt'
+
+     if not os.path.exists(transcript_filename):
+         return None
+
+     with open(transcript_filename, mode='r', encoding='utf-8') as f:
+         for line in f.readlines():
+             fields = line.strip().split()
+             key = fields[0]
+             if key == name:
+                 text = ' '.join(fields[1:]).lower().translate(str.maketrans('', '', string.punctuation))
+                 return TimeAlignedType(0, _get_num_samples(audio) / get_sample_rate(str(audio)), text)
+
+     return None
+
+
+ def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+     """Load time-aligned word data given a LibriSpeech audio file.
+
+     :param audio: Path to the LibriSpeech audio file.
+     :return: A list of TimeAlignedType objects.
+     """
+     return _load_ta(audio, 'words')
+
+
+ def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+     """Load time-aligned phoneme data given a LibriSpeech audio file.
+
+     :param audio: Path to the LibriSpeech audio file.
+     :return: A list of TimeAlignedType objects.
+     """
+     return _load_ta(audio, 'phones')
+
+
+ def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
+     from praatio import textgrid
+
+     file = Path(audio).with_suffix('.TextGrid')
+     if not os.path.exists(file):
+         return None
+
+     tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
+     if tier not in tg.tierNames:
+         return None
+
+     entries: list[TimeAlignedType] = []
+     for entry in tg.getTier(tier).entries:
+         entries.append(TimeAlignedType(text=entry.label, start=entry.start, end=entry.end))
+
+     return entries
+
+
+ def load_speakers(input_dir: Path) -> dict:
+     speakers = {}
+     with open(input_dir / 'SPEAKERS.TXT') as file:
+         for line in file:
+             if not line.startswith(';'):
+                 fields = line.strip().split('|')
+                 speaker_id = fields[0].strip()
+                 gender = fields[1].strip()
+                 speakers[speaker_id] = {'gender': gender}
+     return speakers
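load_text() here works differently from the other corpora: LibriSpeech stores one transcript file per chapter, so the loader scans <speaker>-<chapter>.trans.txt for the matching utterance key and derives the end time from the sample count. A sketch, with a hypothetical corpus path:

    from sonusai.speech import librispeech

    # For .../LibriSpeech/dev-clean/19/198/19-198-0001.flac the loader reads
    # .../19/198/19-198.trans.txt and matches the line keyed '19-198-0001'.
    t = librispeech.load_text('/data/LibriSpeech/dev-clean/19/198/19-198-0001.flac')
    if t is not None:
        print(f'{t.duration:.2f}s: {t.text}')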
sonusai/speech/mcgill.py ADDED
@@ -0,0 +1,70 @@
+ import os
+ from typing import Optional
+
+ from .types import TimeAlignedType
+
+
+ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+     """Load time-aligned text data given a McGill-Speech audio file.
+
+     :param audio: Path to the McGill-Speech audio file.
+     :return: A TimeAlignedType object.
+     """
+     import string
+     import struct
+
+     from sonusai.mixture import get_sample_rate
+
+     if not os.path.exists(audio):
+         return None
+
+     sample_rate = get_sample_rate(str(audio))
+
+     with open(audio, mode='rb') as f:
+         content = f.read()
+
+     riff_id, file_size, wave_id = struct.unpack('<4si4s', content[:12])
+     if riff_id.decode('utf-8') != 'RIFF':
+         return None
+
+     if wave_id.decode('utf-8') != 'WAVE':
+         return None
+
+     fmt_id, fmt_size = struct.unpack('<4si', content[12:20])
+
+     if fmt_id.decode('utf-8') != 'fmt ':
+         return None
+
+     if fmt_size != 16:
+         return None
+
+     (_wave_format_tag,
+      channels,
+      _samples_per_sec,
+      _avg_bytes_per_sec,
+      _block_align,
+      bits_per_sample) = struct.unpack('<hhiihh', content[20:36])
+
+     i = 36
+     samples = None
+     text = None
+     while i < file_size:
+         chunk_id = struct.unpack('<4s', content[i:i + 4])[0].decode('utf-8')
+         chunk_size = struct.unpack('<i', content[i + 4:i + 8])[0]
+
+         if chunk_id == 'data':
+             samples = chunk_size / channels / (bits_per_sample / 8)
+             break
+
+         if chunk_id == 'afsp':
+             chunks = struct.unpack(f'<{chunk_size}s', content[i + 8:i + 8 + chunk_size])[0]
+             chunks = chunks.decode('utf-8').split('\x00')
+             for chunk in chunks:
+                 if chunk.startswith('text: "'):
+                     text = chunk[7:-1].lower().translate(str.maketrans('', '', string.punctuation))
+         i += 8 + chunk_size + chunk_size % 2
+
+     if text and samples:
+         return TimeAlignedType(start=0, end=samples / sample_rate, text=text)
+
+     return None
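Unlike the other loaders, this one pulls the prompt text straight out of the WAV container: it walks the RIFF chunks and reads the text field from the 'afsp' annotation chunk carried by the McGill recordings. Usage is a single call (path hypothetical):

    from sonusai.speech import mcgill

    # Returns None if the file is missing, is not a canonical 16-byte-fmt WAV,
    # or carries no 'text: "..."' entry in its afsp chunk.
    t = mcgill.load_text('/data/mcgill/MA01_02.wav')
    if t is not None:
        print(t.end, t.text)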
sonusai/speech/textgrid.py ADDED
@@ -0,0 +1,100 @@
+ from pathlib import Path
+
+ from praatio import textgrid
+ from praatio.utilities.constants import Interval
+
+ from .types import TimeAlignedType
+
+
+ def _get_duration(name: str) -> float:
+     from pydub import AudioSegment
+
+     from sonusai import SonusAIError
+
+     try:
+         return AudioSegment.from_file(name).duration_seconds
+     except Exception as e:
+         raise SonusAIError(f'Error reading {name}: {e}')
+
+
+ def create_textgrid(prompt: Path,
+                     speaker_id: str,
+                     speaker: dict,
+                     output_dir: Path,
+                     text: TimeAlignedType = None,
+                     words: list[TimeAlignedType] = None,
+                     phonemes: list[TimeAlignedType] = None) -> None:
+     if text is not None or words is not None or phonemes is not None:
+         min_t, max_t = _get_min_max({'phonemes': phonemes,
+                                      'text': [text],
+                                      'words': words})
+     else:
+         min_t = 0
+         max_t = _get_duration(str(prompt))
+
+     tg = textgrid.Textgrid()
+
+     tg.addTier(textgrid.IntervalTier('speaker_id', [Interval(min_t, max_t, speaker_id)], min_t, max_t))
+     for tier in speaker.keys():
+         tg.addTier(textgrid.IntervalTier(tier, [Interval(min_t, max_t, str(speaker[tier]))], min_t, max_t))
+
+     if text is not None:
+         entries = [Interval(text.start, text.end, text.text)]
+         text_tier = textgrid.IntervalTier('text', entries, min_t, max_t)
+         tg.addTier(text_tier)
+
+     if words is not None:
+         entries = []
+         for word in words:
+             entries.append(Interval(word.start, word.end, word.text))
+         words_tier = textgrid.IntervalTier('words', entries, min_t, max_t)
+         tg.addTier(words_tier)
+
+     if phonemes is not None:
+         entries = []
+         for phoneme in phonemes:
+             entries.append(Interval(phoneme.start, phoneme.end, phoneme.text))
+         phonemes_tier = textgrid.IntervalTier('phonemes', entries, min_t, max_t)
+         tg.addTier(phonemes_tier)
+
+     output_filename = str(output_dir / prompt.stem) + '.TextGrid'
+     tg.save(output_filename, format='long_textgrid', includeBlankSpaces=True)
+
+
+ def _get_min_max(tiers: dict[str, list[TimeAlignedType]]) -> tuple[float, float]:
+     starts = []
+     ends = []
+     for tier in tiers.values():
+         if tier is not None:
+             starts.append(tier[0].start)
+             ends.append(tier[-1].end)
+
+     return min(starts), max(ends)
+
+
+ def annotate_textgrid(tiers: dict[str, list[TimeAlignedType]], prompt: Path, output_dir: Path) -> None:
+     import os
+
+     if tiers is None:
+         return
+
+     file = Path(output_dir / prompt.stem).with_suffix('.TextGrid')
+     if not os.path.exists(file):
+         tg = textgrid.Textgrid()
+         min_t, max_t = _get_min_max(tiers)
+     else:
+         tg = textgrid.openTextgrid(str(file), includeEmptyIntervals=False)
+         min_t = tg.minTimestamp
+         max_t = tg.maxTimestamp
+
+     for tier in tiers.keys():
+         entries = []
+         for entry in tiers[tier]:
+             entries.append(Interval(entry.start, entry.end, entry.text))
+         if tier == 'phones':
+             name = 'annotation_phonemes'
+         else:
+             name = 'annotation_' + tier
+         tg.addTier(textgrid.IntervalTier(name, entries, min_t, max_t))
+
+     tg.save(str(file), format='long_textgrid', includeBlankSpaces=True)
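create_textgrid() emits one IntervalTier per speaker metadata field plus optional text/words/phonemes tiers, each spanning the min/max of the supplied alignments. A sketch with hypothetical alignments (note that when word or phoneme alignments are given, text should be passed too, since _get_min_max() indexes the wrapped text entry even when it is None):

    from pathlib import Path

    from sonusai.speech.textgrid import create_textgrid
    from sonusai.speech.types import TimeAlignedType

    words = [TimeAlignedType(0.00, 0.41, 'hello'),
             TimeAlignedType(0.41, 0.92, 'world')]
    text = TimeAlignedType(0.00, 0.92, 'hello world')

    output_dir = Path('/tmp/tg')
    output_dir.mkdir(parents=True, exist_ok=True)

    # Writes /tmp/tg/utt0001.TextGrid with speaker_id, gender, dialect,
    # text, and words tiers (paths and metadata are hypothetical).
    create_textgrid(prompt=Path('/data/corpus/wav/utt0001.wav'),
                    speaker_id='spk1',
                    speaker={'gender': 'F', 'dialect': 'Northern'},
                    output_dir=output_dir,
                    text=text,
                    words=words)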
sonusai/speech/timit.py ADDED
@@ -0,0 +1,135 @@
+ import os
+ from pathlib import Path
+ from typing import Optional
+
+ from .types import TimeAlignedType
+
+
+ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+     """Load time-aligned text data given a TIMIT audio file.
+
+     :param audio: Path to the TIMIT audio file.
+     :return: A TimeAlignedType object.
+     """
+     import string
+
+     from sonusai.mixture import get_sample_rate
+
+     file = Path(audio).with_suffix('.TXT')
+     if not os.path.exists(file):
+         return None
+
+     sample_rate = get_sample_rate(str(audio))
+
+     with open(file, mode='r', encoding='utf-8') as f:
+         line = f.read()
+
+     fields = line.strip().split()
+     start = int(fields[0]) / sample_rate
+     end = int(fields[1]) / sample_rate
+     text = ' '.join(fields[2:]).lower().translate(str.maketrans('', '', string.punctuation))
+
+     return TimeAlignedType(start, end, text)
+
+
+ def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+     """Load time-aligned word data given a TIMIT audio file.
+
+     :param audio: Path to the TIMIT audio file.
+     :return: A list of TimeAlignedType objects.
+     """
+     return _load_ta(audio, 'words')
+
+
+ def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+     """Load time-aligned phoneme data given a TIMIT audio file.
+
+     :param audio: Path to the TIMIT audio file.
+     :return: A list of TimeAlignedType objects.
+     """
+     return _load_ta(audio, 'phonemes')
+
+
+ def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
+     from sonusai.mixture import get_sample_rate
+
+     if tier == 'words':
+         file = Path(audio).with_suffix('.WRD')
+     elif tier == 'phonemes':
+         file = Path(audio).with_suffix('.PHN')
+     else:
+         raise ValueError(f'Unknown tier: {tier}')
+
+     if not os.path.exists(file):
+         return None
+
+     sample_rate = get_sample_rate(str(audio))
+
+     entries: list[TimeAlignedType] = []
+     first = True
+     with open(file, mode='r', encoding='utf-8') as f:
+         for line in f.readlines():
+             fields = line.strip().split()
+             start = int(fields[0]) / sample_rate
+             end = int(fields[1]) / sample_rate
+             text = ' '.join(fields[2:])
+
+             if first:
+                 first = False
+             else:
+                 if start < entries[-1].end:
+                     start = entries[-1].end - (entries[-1].end - start) // 2
+                     entries[-1] = TimeAlignedType(text=entries[-1].text, start=entries[-1].start, end=start)
+
+                 if end <= start:
+                     end = start + 1 / sample_rate
+
+             entries.append(TimeAlignedType(text=text, start=start, end=end))
+
+     return entries
+
+
+ def _years_between(record, born):
+     try:
+         rec_fields = [int(x) for x in record.split('/')]
+         brn_fields = [int(x) for x in born.split('/')]
+         return rec_fields[2] - brn_fields[2] - ((rec_fields[1], rec_fields[0]) < (brn_fields[1], brn_fields[0]))
+     except ValueError:
+         return '??'
+
+
+ def _decode_dialect(d: str) -> str:
+     if d in ['DR1', '1']:
+         return 'New England'
+     if d in ['DR2', '2']:
+         return 'Northern'
+     if d in ['DR3', '3']:
+         return 'North Midland'
+     if d in ['DR4', '4']:
+         return 'South Midland'
+     if d in ['DR5', '5']:
+         return 'Southern'
+     if d in ['DR6', '6']:
+         return 'New York City'
+     if d in ['DR7', '7']:
+         return 'Western'
+     if d in ['DR8', '8']:
+         return 'Army Brat'
+
+     raise ValueError(f'Unrecognized dialect: {d}')
+
+
+ def load_speakers(input_dir: Path) -> dict:
+     speakers = {}
+     with open(input_dir / 'SPKRINFO.TXT') as file:
+         for line in file:
+             if not line.startswith(';'):
+                 fields = line.strip().split()
+                 speaker_id = fields[0]
+                 gender = fields[1]
+                 dialect = _decode_dialect(fields[2])
+                 age = _years_between(fields[4], fields[5])
+                 speakers[speaker_id] = {'gender': gender, 'dialect': dialect, 'age': age}
+     return speakers
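The TIMIT .WRD/.PHN files store boundaries as sample offsets; _load_ta() converts them to seconds and nudges overlapping or zero-length intervals so successive entries stay in order. Typical use (path hypothetical):

    from sonusai.speech import timit

    # Word alignments come from SA1.WRD next to the audio file.
    words = timit.load_words('/data/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV')
    if words is not None:
        for w in words:
            print(f'{w.start:.3f}-{w.end:.3f} {w.text}')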
sonusai/speech/types.py ADDED
@@ -0,0 +1,12 @@
+ from dataclasses import dataclass
+
+
+ @dataclass(frozen=True)
+ class TimeAlignedType:
+     start: float
+     end: float
+     text: str
+
+     @property
+     def duration(self) -> float:
+         return self.end - self.start
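TimeAlignedType is the one record shared by every loader in the new sonusai.speech package; duration is derived rather than stored, so the dataclass can stay frozen. For example:

    from sonusai.speech.types import TimeAlignedType

    t = TimeAlignedType(start=1.25, end=2.0, text='hello')
    assert t.duration == 0.75  # computed as end - start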
sonusai/speech/vctk.py ADDED
@@ -0,0 +1,52 @@
+ import os
+ from pathlib import Path
+ from typing import Optional
+
+ from .types import TimeAlignedType
+
+
+ def _get_duration(name: str) -> float:
+     import soundfile
+
+     from sonusai import SonusAIError
+
+     try:
+         return soundfile.info(name).duration
+     except Exception as e:
+         raise SonusAIError(f'Error reading {name}: {e}')
+
+
+ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+     """Load time-aligned text data given a VCTK audio file.
+
+     :param audio: Path to the VCTK audio file.
+     :return: A TimeAlignedType object.
+     """
+     import string
+
+     file = Path(audio).parents[2] / 'txt' / Path(audio).parent.name / (Path(audio).stem[:-5] + '.txt')
+     if not os.path.exists(file):
+         return None
+
+     with open(file, mode='r', encoding='utf-8') as f:
+         line = f.read()
+
+     start = 0
+     end = _get_duration(str(audio))
+     text = line.strip().lower().translate(str.maketrans('', '', string.punctuation))
+
+     return TimeAlignedType(start, end, text)
+
+
+ def load_speakers(input_dir: Path) -> dict:
+     speakers = {}
+     with open(input_dir / 'speaker-info.txt') as file:
+         for line in file:
+             if not line.startswith('ID'):
+                 fields = line.strip().split('(', 1)[0].split()
+                 speaker_id = fields[0]
+                 age = fields[1]
+                 gender = fields[2]
+                 dialect = ' '.join([field for field in fields[3:]])
+                 speakers[speaker_id] = {'gender': gender, 'dialect': dialect, 'age': age}
+     return speakers
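The stem[:-5] slice in load_text() drops what appears to be a five-character microphone suffix before looking up the transcript (an assumption based on VCTK 0.92 file naming such as '_mic1'):

    from sonusai.speech import vctk

    # Hypothetical layout: <root>/wav48_silence_trimmed/p225/p225_001_mic1.flac
    # resolves to <root>/txt/p225/p225_001.txt.
    t = vctk.load_text('/data/VCTK/wav48_silence_trimmed/p225/p225_001_mic1.flac')
    if t is not None:
        print(t.text)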
sonusai/speech/voxceleb2.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass(frozen=True)
+ class Segment:
+     person: str
+     video: str
+     id: str
+     start: float
+     stop: float
+
+
+ def load_speakers(input_dir: Path) -> dict:
+     import csv
+
+     speakers = {}
+     first = True
+     with open(input_dir / 'vox2_meta_cleansed.csv', newline='') as file:
+         data = csv.reader(file)
+         for row in data:
+             if first:
+                 first = False
+             else:
+                 speakers[row[0].strip()] = {'gender': row[2].strip(), 'category': row[3].strip()}
+     return speakers
+
+
+ def load_segment(path: str | os.PathLike[str]) -> Segment:
+     path = Path(path)
+
+     with path.open('r') as file:
+         segment = file.read().strip()
+
+     header, frames = segment.split('\n\n')
+     header_dict = _parse_header(header)
+     start, stop = _get_segment_boundaries(frames)
+
+     return Segment(person=header_dict['Identity'],
+                    video=header_dict['Reference'],
+                    id=path.stem,
+                    start=start,
+                    stop=stop)
+
+
+ def _parse_header(header: str) -> dict:
+     def _parse_line(line: str) -> tuple[str, str]:
+         """Parse a line of header text into a key/value pair.
+
+         Header text has the following format:
+
+             Identity : \tid00017
+             Reference : \t7t6lfzvVaTM
+             Offset : \t1
+             FV Conf : \t16.647\t(1)
+             ASD Conf : \t4.465
+
+         """
+         k, v = line.split('\t', maxsplit=1)
+         k = k[:-2].strip()
+         v = v.strip()
+         return k, v
+
+     return dict(_parse_line(line) for line in header.split('\n'))
+
+
+ def _get_segment_boundaries(frames: str) -> tuple[float, float]:
+     """Get the start and stop points of the segment.
+
+     Frames text has the following format:
+
+         FRAME X Y W H
+         000245 0.392 0.223 0.253 0.451
+         ...
+         000470 0.359 0.207 0.260 0.463
+
+     """
+
+     def _get_frame_seconds(line: str) -> float:
+         frame = int(line.split('\t')[0])
+         # YouTube is 25 FPS
+         return frame / 25
+
+     lines = frames.split('\n')
+     return _get_frame_seconds(lines[1]), _get_frame_seconds(lines[-1])
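load_segment() parses the VoxCeleb2 face-track text files: a tab-separated header block, a blank line, then one row per video frame, with start/stop derived from the first and last frame numbers at the assumed 25 FPS. Sketch (path hypothetical):

    from sonusai.speech import voxceleb2

    seg = voxceleb2.load_segment('/data/voxceleb2/txt/id00017/7t6lfzvVaTM/00001.txt')
    print(seg.person, seg.video, seg.id)  # e.g. id00017 7t6lfzvVaTM 00001
    print(seg.start, seg.stop)            # seconds, computed as frame / 25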
sonusai/utils/__init__.py CHANGED
@@ -27,11 +27,12 @@ from .max_text_width import max_text_width
  from .model_utils import import_module
  from .numeric_conversion import float_to_int16
  from .numeric_conversion import int16_to_float
- from .onnx_utils import SonusAIMetaData
  from .onnx_utils import add_sonusai_metadata
  from .onnx_utils import get_sonusai_metadata
+ from .onnx_utils import load_ort_session
  from .parallel import pp_imap
  from .parallel import pp_tqdm_imap
+ from .path_info import PathInfo
  from .print_mixture_details import print_class_count
  from .print_mixture_details import print_mixture_details
  from .ranges import consolidate_range
sonusai/utils/asr_manifest_functions/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from .data import PathInfo
  from .data import TranscriptData
  from .librispeech import collect_librispeech_transcripts
  from .librispeech import get_librispeech_manifest_entry
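Taken together, the two __init__ diffs amount to a small import migration: PathInfo moves from sonusai/utils/asr_manifest_functions/data.py to the new sonusai/utils/path_info.py, load_ort_session is newly re-exported, and the SonusAIMetaData re-export is dropped. A caller written against 0.17.0 would presumably update like this (a sketch, not taken from the package docs):

    # 0.17.0:
    # from sonusai.utils.asr_manifest_functions import PathInfo

    # 0.17.2:
    from sonusai.utils import PathInfo
    from sonusai.utils import load_ort_session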