PyPI - sonusai - Versions diffs - 0.18.9__py3-none-any.whl → 0.19.5__py3-none-any.whl - Mend

sonusai 0.18.9__py3-none-any.whl → 0.19.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

sonusai/__init__.py +20 -29
sonusai/aawscd_probwrite.py +18 -18
sonusai/audiofe.py +93 -80
sonusai/calc_metric_spenh.py +395 -321
sonusai/data/genmixdb.yml +5 -11
sonusai/{gentcst.py → deprecated/gentcst.py} +146 -149
sonusai/{plot.py → deprecated/plot.py} +177 -131
sonusai/{tplot.py → deprecated/tplot.py} +124 -102
sonusai/doc/__init__.py +1 -1
sonusai/doc/doc.py +112 -177
sonusai/doc.py +10 -10
sonusai/genft.py +93 -77
sonusai/genmetrics.py +59 -46
sonusai/genmix.py +116 -104
sonusai/genmixdb.py +194 -153
sonusai/lsdb.py +56 -66
sonusai/main.py +23 -20
sonusai/metrics/__init__.py +2 -0
sonusai/metrics/calc_audio_stats.py +29 -24
sonusai/metrics/calc_class_weights.py +7 -7
sonusai/metrics/calc_optimal_thresholds.py +5 -7
sonusai/metrics/calc_pcm.py +3 -3
sonusai/metrics/calc_pesq.py +10 -7
sonusai/metrics/calc_phase_distance.py +3 -3
sonusai/metrics/calc_sa_sdr.py +10 -8
sonusai/metrics/calc_segsnr_f.py +15 -17
sonusai/metrics/calc_speech.py +105 -47
sonusai/metrics/calc_wer.py +35 -32
sonusai/metrics/calc_wsdr.py +10 -7
sonusai/metrics/class_summary.py +30 -27
sonusai/metrics/confusion_matrix_summary.py +25 -22
sonusai/metrics/one_hot.py +91 -57
sonusai/metrics/snr_summary.py +53 -46
sonusai/mixture/__init__.py +19 -14
sonusai/mixture/audio.py +4 -6
sonusai/mixture/augmentation.py +37 -43
sonusai/mixture/class_count.py +5 -14
sonusai/mixture/config.py +292 -225
sonusai/mixture/constants.py +41 -30
sonusai/mixture/data_io.py +155 -0
sonusai/mixture/datatypes.py +111 -108
sonusai/mixture/db_datatypes.py +54 -70
sonusai/mixture/eq_rule_is_valid.py +6 -9
sonusai/mixture/feature.py +40 -38
sonusai/mixture/generation.py +522 -389
sonusai/mixture/helpers.py +217 -272
sonusai/mixture/log_duration_and_sizes.py +16 -13
sonusai/mixture/mixdb.py +669 -477
sonusai/mixture/soundfile_audio.py +12 -17
sonusai/mixture/sox_audio.py +91 -112
sonusai/mixture/sox_augmentation.py +8 -9
sonusai/mixture/spectral_mask.py +4 -6
sonusai/mixture/target_class_balancing.py +41 -36
sonusai/mixture/targets.py +69 -67
sonusai/mixture/tokenized_shell_vars.py +23 -23
sonusai/mixture/torchaudio_audio.py +14 -15
sonusai/mixture/torchaudio_augmentation.py +23 -27
sonusai/mixture/truth.py +48 -26
sonusai/mixture/truth_functions/__init__.py +26 -0
sonusai/mixture/truth_functions/crm.py +56 -38
sonusai/mixture/truth_functions/datatypes.py +37 -0
sonusai/mixture/truth_functions/energy.py +85 -59
sonusai/mixture/truth_functions/file.py +30 -30
sonusai/mixture/truth_functions/phoneme.py +14 -7
sonusai/mixture/truth_functions/sed.py +71 -45
sonusai/mixture/truth_functions/target.py +69 -106
sonusai/mkwav.py +52 -85
sonusai/onnx_predict.py +46 -43
sonusai/queries/__init__.py +3 -1
sonusai/queries/queries.py +100 -59
sonusai/speech/__init__.py +2 -0
sonusai/speech/l2arctic.py +24 -23
sonusai/speech/librispeech.py +16 -17
sonusai/speech/mcgill.py +22 -21
sonusai/speech/textgrid.py +32 -25
sonusai/speech/timit.py +45 -42
sonusai/speech/vctk.py +14 -13
sonusai/speech/voxceleb.py +26 -20
sonusai/summarize_metric_spenh.py +11 -10
sonusai/utils/__init__.py +4 -3
sonusai/utils/asl_p56.py +1 -1
sonusai/utils/asr.py +37 -17
sonusai/utils/asr_functions/__init__.py +2 -0
sonusai/utils/asr_functions/aaware_whisper.py +18 -12
sonusai/utils/audio_devices.py +12 -12
sonusai/utils/braced_glob.py +6 -8
sonusai/utils/calculate_input_shape.py +1 -4
sonusai/utils/compress.py +2 -2
sonusai/utils/convert_string_to_number.py +1 -3
sonusai/utils/create_timestamp.py +1 -1
sonusai/utils/create_ts_name.py +2 -2
sonusai/utils/dataclass_from_dict.py +1 -1
sonusai/utils/docstring.py +6 -6
sonusai/utils/energy_f.py +9 -7
sonusai/utils/engineering_number.py +56 -54
sonusai/utils/get_label_names.py +8 -10
sonusai/utils/human_readable_size.py +2 -2
sonusai/utils/model_utils.py +3 -5
sonusai/utils/numeric_conversion.py +2 -4
sonusai/utils/onnx_utils.py +43 -32
sonusai/utils/parallel.py +40 -27
sonusai/utils/print_mixture_details.py +25 -22
sonusai/utils/ranges.py +12 -12
sonusai/utils/read_predict_data.py +11 -9
sonusai/utils/reshape.py +19 -26
sonusai/utils/seconds_to_hms.py +1 -1
sonusai/utils/stacked_complex.py +8 -16
sonusai/utils/stratified_shuffle_split.py +29 -27
sonusai/utils/write_audio.py +2 -2
sonusai/utils/yes_or_no.py +3 -3
sonusai/vars.py +14 -14
{sonusai-0.18.9.dist-info → sonusai-0.19.5.dist-info}/METADATA +20 -21
sonusai-0.19.5.dist-info/RECORD +125 -0
{sonusai-0.18.9.dist-info → sonusai-0.19.5.dist-info}/WHEEL +1 -1
sonusai/mixture/truth_functions/data.py +0 -58
sonusai/utils/read_mixture_data.py +0 -14
sonusai-0.18.9.dist-info/RECORD +0 -125
{sonusai-0.18.9.dist-info → sonusai-0.19.5.dist-info}/entry_points.txt +0 -0

sonusai/speech/librispeech.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional
 from .types import TimeAlignedType
@@ -14,16 +13,16 @@ def _get_num_samples(audio: str | os.PathLike[str]) -> int:
     import soundfile
     from pydub import AudioSegment
-    if Path(audio).suffix == '.mp3':
+    if Path(audio).suffix == ".mp3":
         return AudioSegment.from_mp3(audio).frame_count()
-    if Path(audio).suffix == '.m4a':
+    if Path(audio).suffix == ".m4a":
         return AudioSegment.from_file(audio).frame_count()
     return soundfile.info(audio).frames
-def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+def load_text(audio: str | os.PathLike[str]) -> TimeAlignedType | None:
     """Load text data from a LibriSpeech transcription file given a LibriSpeech audio filename.
     :param audio: Path to the LibriSpeech audio file.
@@ -35,44 +34,44 @@ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
     path = Path(audio)
     name = path.stem
-    transcript_filename = path.parent / f'{path.parent.parent.name}-{path.parent.name}.trans.txt'
+    transcript_filename = path.parent / f"{path.parent.parent.name}-{path.parent.name}.trans.txt"
     if not os.path.exists(transcript_filename):
         return None
-    with open(transcript_filename, mode='r', encoding='utf-8') as f:
+    with open(transcript_filename, encoding="utf-8") as f:
         for line in f.readlines():
             fields = line.strip().split()
             key = fields[0]
             if key == name:
-                text = ' '.join(fields[1:]).lower().translate(str.maketrans('', '', string.punctuation))
+                text = " ".join(fields[1:]).lower().translate(str.maketrans("", "", string.punctuation))
                 return TimeAlignedType(0, _get_num_samples(audio) / get_sample_rate(str(audio)), text)
     return None
-def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+def load_words(audio: str | os.PathLike[str]) -> list[TimeAlignedType] | None:
     """Load time-aligned word data given a LibriSpeech audio file.
     :param audio: Path to the Librispeech audio file.
     :return: A list of TimeAlignedType objects.
     """
-    return _load_ta(audio, 'words')
+    return _load_ta(audio, "words")
-def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+def load_phonemes(audio: str | os.PathLike[str]) -> list[TimeAlignedType] | None:
     """Load time-aligned phonemes data given a LibriSpeech audio file.
     :param audio: Path to the LibriSpeech audio file.
     :return: A list of TimeAlignedType objects.
     """
-    return _load_ta(audio, 'phones')
+    return _load_ta(audio, "phones")
-def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
+def _load_ta(audio: str | os.PathLike[str], tier: str) -> list[TimeAlignedType] | None:
     from praatio import textgrid
-    file = Path(audio).with_suffix('.TextGrid')
+    file = Path(audio).with_suffix(".TextGrid")
     if not os.path.exists(file):
         return None
@@ -89,11 +88,11 @@ def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlig
 def load_speakers(input_dir: Path) -> dict:
     speakers = {}
-    with open(input_dir / 'SPEAKERS.TXT') as file:
+    with open(input_dir / "SPEAKERS.TXT") as file:
         for line in file:
-            if not line.startswith(';'):
-                fields = line.strip().split('|')
+            if not line.startswith(";"):
+                fields = line.strip().split("|")
                 speaker_id = fields[0].strip()
                 gender = fields[1].strip()
-                speakers[speaker_id] = {'gender': gender}
+                speakers[speaker_id] = {"gender": gender}
     return speakers

sonusai/speech/mcgill.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import os
-from typing import Optional
 from .types import TimeAlignedType
-def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+def load_text(audio: str | os.PathLike[str]) -> TimeAlignedType | None:
     """Load time-aligned text data given a McGill-Speech audio file.
     :param audio: Path to the McGill-Speech audio file.
@@ -20,48 +19,50 @@ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
     sample_rate = get_sample_rate(str(audio))
-    with open(audio, mode='rb') as f:
+    with open(audio, mode="rb") as f:
         content = f.read()
-    riff_id, file_size, wave_id = struct.unpack('<4si4s', content[:12])
-    if riff_id.decode('utf-8') != 'RIFF':
+    riff_id, file_size, wave_id = struct.unpack("<4si4s", content[:12])
+    if riff_id.decode("utf-8") != "RIFF":
         return None
-    if wave_id.decode('utf-8') != 'WAVE':
+    if wave_id.decode("utf-8") != "WAVE":
         return None
-    fmt_id, fmt_size = struct.unpack('<4si', content[12:20])
+    fmt_id, fmt_size = struct.unpack("<4si", content[12:20])
-    if fmt_id.decode('utf-8') != 'fmt ':
+    if fmt_id.decode("utf-8") != "fmt ":
         return None
     if fmt_size != 16:
         return None
-    (_wave_format_tag,
-     channels,
-     _samples_per_sec,
-     _avg_bytes_per_sec,
-     _block_align,
-     bits_per_sample) = struct.unpack('<hhiihh', content[20:36])
+    (
+        _wave_format_tag,
+        channels,
+        _samples_per_sec,
+        _avg_bytes_per_sec,
+        _block_align,
+        bits_per_sample,
+    ) = struct.unpack("<hhiihh", content[20:36])
     i = 36
     samples = None
     text = None
     while i < file_size:
-        chunk_id = struct.unpack('<4s', content[i:i + 4])[0].decode('utf-8')
-        chunk_size = struct.unpack('<i', content[i + 4:i + 8])[0]
+        chunk_id = struct.unpack("<4s", content[i : i + 4])[0].decode("utf-8")
+        chunk_size = struct.unpack("<i", content[i + 4 : i + 8])[0]
-        if chunk_id == 'data':
+        if chunk_id == "data":
             samples = chunk_size / channels / (bits_per_sample / 8)
             break
-        if chunk_id == 'afsp':
-            chunks = struct.unpack(f'<{chunk_size}s', content[i + 8:i + 8 + chunk_size])[0]
-            chunks = chunks.decode('utf-8').split('\x00')
+        if chunk_id == "afsp":
+            chunks = struct.unpack(f"<{chunk_size}s", content[i + 8 : i + 8 + chunk_size])[0]
+            chunks = chunks.decode("utf-8").split("\x00")
             for chunk in chunks:
                 if chunk.startswith('text: "'):
-                    text = chunk[7:-1].lower().translate(str.maketrans('', '', string.punctuation))
+                    text = chunk[7:-1].lower().translate(str.maketrans("", "", string.punctuation))
         i += 8 + chunk_size + chunk_size % 2
     if text and samples:

sonusai/speech/textgrid.py CHANGED Viewed

@@ -6,61 +6,68 @@ from praatio.utilities.constants import Interval
 from .types import TimeAlignedType
-def create_textgrid(prompt: Path,
-                    output_dir: Path,
-                    text: TimeAlignedType = None,
-                    words: list[TimeAlignedType] = None,
-                    phonemes: list[TimeAlignedType] = None) -> None:
+def create_textgrid(
+    prompt: Path,
+    output_dir: Path,
+    text: TimeAlignedType | None = None,
+    words: list[TimeAlignedType] | None = None,
+    phonemes: list[TimeAlignedType] | None = None,
+) -> None:
     if text is None and words is None and phonemes is None:
         return
-    min_t, max_t = _get_min_max({'phonemes': phonemes,
-                                 'text':     [text],
-                                 'words':    words})
+    min_t, max_t = _get_min_max({"phonemes": phonemes, "text": text, "words": words})
     tg = textgrid.Textgrid()
     if text is not None:
         entries = [Interval(text.start, text.end, text.text)]
-        text_tier = textgrid.IntervalTier('text', entries, min_t, max_t)
+        text_tier = textgrid.IntervalTier("text", entries, min_t, max_t)
         tg.addTier(text_tier)
     if words is not None:
         entries = []
         for word in words:
             entries.append(Interval(word.start, word.end, word.text))
-        words_tier = textgrid.IntervalTier('words', entries, min_t, max_t)
+        words_tier = textgrid.IntervalTier("words", entries, min_t, max_t)
         tg.addTier(words_tier)
     if phonemes is not None:
         entries = []
         for phoneme in phonemes:
             entries.append(Interval(phoneme.start, phoneme.end, phoneme.text))
-        phonemes_tier = textgrid.IntervalTier('phonemes', entries, min_t, max_t)
+        phonemes_tier = textgrid.IntervalTier("phonemes", entries, min_t, max_t)
         tg.addTier(phonemes_tier)
-    output_filename = str(output_dir / prompt.stem) + '.TextGrid'
-    tg.save(output_filename, format='long_textgrid', includeBlankSpaces=True)
+    output_filename = str(output_dir / prompt.stem) + ".TextGrid"
+    tg.save(output_filename, format="long_textgrid", includeBlankSpaces=True)
-def _get_min_max(tiers: dict[str, list[TimeAlignedType]]) -> tuple[float, float]:
+def _get_min_max(tiers: dict[str, TimeAlignedType | list[TimeAlignedType] | None]) -> tuple[float, float]:
     starts = []
     ends = []
     for tier in tiers.values():
-        if tier is not None:
+        if tier is None:
+            continue
+        if isinstance(tier, TimeAlignedType):
+            starts.append(tier.start)
+            ends.append(tier.end)
+        else:
             starts.append(tier[0].start)
             ends.append(tier[-1].end)
     return min(starts), max(ends)
-def annotate_textgrid(tiers: dict[str, list[TimeAlignedType]], prompt: Path, output_dir: Path) -> None:
+def annotate_textgrid(
+    tiers: dict[str, TimeAlignedType | list[TimeAlignedType] | None] | None, prompt: Path, output_dir: Path
+) -> None:
     import os
     if tiers is None:
         return
-    file = Path(output_dir / prompt.stem).with_suffix('.TextGrid')
+    file = Path(output_dir / prompt.stem).with_suffix(".TextGrid")
     if not os.path.exists(file):
         tg = textgrid.Textgrid()
         min_t, max_t = _get_min_max(tiers)
@@ -69,14 +76,14 @@ def annotate_textgrid(tiers: dict[str, list[TimeAlignedType]], prompt: Path, out
         min_t = tg.minTimestamp
         max_t = tg.maxTimestamp
-    for tier in tiers.keys():
-        entries = []
-        for entry in tiers[tier]:
-            entries.append(Interval(entry.start, entry.end, entry.text))
-        if tier == 'phones':
-            name = 'annotation_phonemes'
+    for k, v in tiers.items():
+        if v is None:
+            continue
+        entries = [Interval(entry.start, entry.end, entry.text) for entry in v]
+        if k == "phones":
+            name = "annotation_phonemes"
         else:
-            name = 'annotation_' + tier
+            name = "annotation_" + k
         tg.addTier(textgrid.IntervalTier(name, entries, min_t, max_t))
-    tg.save(str(file), format='long_textgrid', includeBlankSpaces=True)
+    tg.save(str(file), format="long_textgrid", includeBlankSpaces=True)

sonusai/speech/timit.py CHANGED Viewed

@@ -1,11 +1,10 @@
 import os
 from pathlib import Path
-from typing import Optional
 from .types import TimeAlignedType
-def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+def load_text(audio: str | os.PathLike[str]) -> TimeAlignedType | None:
     """Load time-aligned text data given a TIMIT audio file.
     :param audio: Path to the TIMIT audio file.
@@ -15,52 +14,52 @@ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
     from sonusai.mixture import get_sample_rate
-    file = Path(audio).with_suffix('.TXT')
+    file = Path(audio).with_suffix(".TXT")
     if not os.path.exists(file):
         return None
     sample_rate = get_sample_rate(str(audio))
-    with open(file, mode='r', encoding='utf-8') as f:
+    with open(file, encoding="utf-8") as f:
         line = f.read()
     fields = line.strip().split()
     start = int(fields[0]) / sample_rate
     end = int(fields[1]) / sample_rate
-    text = ' '.join(fields[2:]).lower().translate(str.maketrans('', '', string.punctuation))
+    text = " ".join(fields[2:]).lower().translate(str.maketrans("", "", string.punctuation))
     return TimeAlignedType(start, end, text)
-def load_words(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+def load_words(audio: str | os.PathLike[str]) -> list[TimeAlignedType] | None:
     """Load time-aligned word data given a TIMIT audio file.
     :param audio: Path to the TIMIT audio file.
     :return: A list of TimeAlignedType objects.
     """
-    return _load_ta(audio, 'words')
+    return _load_ta(audio, "words")
-def load_phonemes(audio: str | os.PathLike[str]) -> Optional[list[TimeAlignedType]]:
+def load_phonemes(audio: str | os.PathLike[str]) -> list[TimeAlignedType] | None:
     """Load time-aligned phonemes data given a TIMIT audio file.
     :param audio: Path to the TIMIT audio file.
     :return: A list of TimeAlignedType objects.
     """
-    return _load_ta(audio, 'phonemes')
+    return _load_ta(audio, "phonemes")
-def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlignedType]]:
+def _load_ta(audio: str | os.PathLike[str], tier: str) -> list[TimeAlignedType] | None:
     from sonusai.mixture import get_sample_rate
-    if tier == 'words':
-        file = Path(audio).with_suffix('.WRD')
-    elif tier == 'phonemes':
-        file = Path(audio).with_suffix('.PHN')
+    if tier == "words":
+        file = Path(audio).with_suffix(".WRD")
+    elif tier == "phonemes":
+        file = Path(audio).with_suffix(".PHN")
     else:
-        raise ValueError(f'Unknown tier: {tier}')
+        raise ValueError(f"Unknown tier: {tier}")
     if not os.path.exists(file):
         return None
@@ -69,18 +68,18 @@ def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlig
     entries: list[TimeAlignedType] = []
     first = True
-    with open(file, mode='r', encoding='utf-8') as f:
+    with open(file, encoding="utf-8") as f:
         for line in f.readlines():
             fields = line.strip().split()
             start = int(fields[0]) / sample_rate
             end = int(fields[1]) / sample_rate
-            text = ' '.join(fields[2:])
+            text = " ".join(fields[2:])
             if first:
                 first = False
             else:
                 if start < entries[-1].end:
-                    start = entries[-1].end - (entries[- 1].end - start) // 2
+                    start = entries[-1].end - (entries[-1].end - start) // 2
                     entries[-1] = TimeAlignedType(text=entries[-1].text, start=entries[-1].start, end=start)
                 if end <= start:
@@ -93,43 +92,47 @@ def _load_ta(audio: str | os.PathLike[str], tier: str) -> Optional[list[TimeAlig
 def _years_between(record, born):
     try:
-        rec_fields = [int(x) for x in record.split('/')]
-        brn_fields = [int(x) for x in born.split('/')]
+        rec_fields = [int(x) for x in record.split("/")]
+        brn_fields = [int(x) for x in born.split("/")]
         return rec_fields[2] - brn_fields[2] - ((rec_fields[1], rec_fields[0]) < (brn_fields[1], brn_fields[0]))
     except ValueError:
-        return '??'
+        return "??"
 def _decode_dialect(d: str) -> str:
-    if d in ['DR1', '1']:
-        return 'New England'
-    if d in ['DR2', '2']:
-        return 'Northern'
-    if d in ['DR3', '3']:
-        return 'North Midland'
-    if d in ['DR4', '4']:
-        return 'South Midland'
-    if d in ['DR5', '5']:
-        return 'Southern'
-    if d in ['DR6', '6']:
-        return 'New York City'
-    if d in ['DR7', '7']:
-        return 'Western'
-    if d in ['DR8', '8']:
-        return 'Army Brat'
-    raise ValueError(f'Unrecognized dialect: {d}')
+    if d in ["DR1", "1"]:
+        return "New England"
+    if d in ["DR2", "2"]:
+        return "Northern"
+    if d in ["DR3", "3"]:
+        return "North Midland"
+    if d in ["DR4", "4"]:
+        return "South Midland"
+    if d in ["DR5", "5"]:
+        return "Southern"
+    if d in ["DR6", "6"]:
+        return "New York City"
+    if d in ["DR7", "7"]:
+        return "Western"
+    if d in ["DR8", "8"]:
+        return "Army Brat"
+    raise ValueError(f"Unrecognized dialect: {d}")
 def load_speakers(input_dir: Path) -> dict:
     speakers = {}
-    with open(input_dir / 'SPKRINFO.TXT') as file:
+    with open(input_dir / "SPKRINFO.TXT") as file:
         for line in file:
-            if not line.startswith(';'):
+            if not line.startswith(";"):
                 fields = line.strip().split()
                 speaker_id = fields[0]
                 gender = fields[1]
                 dialect = _decode_dialect(fields[2])
                 age = _years_between(fields[4], fields[5])
-                speakers[speaker_id] = {'gender': gender, 'dialect': dialect, 'age': age}
+                speakers[speaker_id] = {
+                    "gender": gender,
+                    "dialect": dialect,
+                    "age": age,
+                }
     return speakers

sonusai/speech/vctk.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional
 from .types import TimeAlignedType
@@ -8,15 +7,13 @@ from .types import TimeAlignedType
 def _get_duration(name: str) -> float:
     import soundfile
-    from sonusai import SonusAIError
     try:
         return soundfile.info(name).duration
     except Exception as e:
-        raise SonusAIError(f'Error reading {name}: {e}')
+        raise OSError(f"Error reading {name}: {e}") from e
-def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
+def load_text(audio: str | os.PathLike[str]) -> TimeAlignedType | None:
     """Load time-aligned text data given a VCTK audio file.
     :param audio: Path to the VCTK audio file.
@@ -24,29 +21,33 @@ def load_text(audio: str | os.PathLike[str]) -> Optional[TimeAlignedType]:
     """
     import string
-    file = Path(audio).parents[2] / 'txt' / Path(audio).parent.name / (Path(audio).stem[:-5] + '.txt')
+    file = Path(audio).parents[2] / "txt" / Path(audio).parent.name / (Path(audio).stem[:-5] + ".txt")
     if not os.path.exists(file):
         return None
-    with open(file, mode='r', encoding='utf-8') as f:
+    with open(file, encoding="utf-8") as f:
         line = f.read()
     start = 0
     end = _get_duration(str(audio))
-    text = line.strip().lower().translate(str.maketrans('', '', string.punctuation))
+    text = line.strip().lower().translate(str.maketrans("", "", string.punctuation))
     return TimeAlignedType(start, end, text)
 def load_speakers(input_dir: Path) -> dict:
     speakers = {}
-    with open(input_dir / 'speaker-info.txt') as file:
+    with open(input_dir / "speaker-info.txt") as file:
         for line in file:
-            if not line.startswith('ID'):
-                fields = line.strip().split('(', 1)[0].split()
+            if not line.startswith("ID"):
+                fields = line.strip().split("(", 1)[0].split()
                 speaker_id = fields[0]
                 age = fields[1]
                 gender = fields[2]
-                dialect = ' '.join([field for field in fields[3:]])
-                speakers[speaker_id] = {'gender': gender, 'dialect': dialect, 'age': age}
+                dialect = " ".join(list(fields[3:]))
+                speakers[speaker_id] = {
+                    "gender": gender,
+                    "dialect": dialect,
+                    "age": age,
+                }
     return speakers

sonusai/speech/voxceleb.py CHANGED Viewed

@@ -19,26 +19,30 @@ def load_speakers(input_dir: Path) -> dict:
     # VoxCeleb1
     first = True
-    with open(input_dir / 'vox1_meta.csv', newline='') as file:
-        data = csv.reader(file, delimiter='\t')
+    with open(input_dir / "vox1_meta.csv", newline="") as file:
+        data = csv.reader(file, delimiter="\t")
         for row in data:
             if first:
                 first = False
             else:
-                speakers[row[0].strip()] = {'gender':   row[2].strip(),
-                                            'dialect':  row[3].strip(),
-                                            'category': row[4].strip()}
+                speakers[row[0].strip()] = {
+                    "gender": row[2].strip(),
+                    "dialect": row[3].strip(),
+                    "category": row[4].strip(),
+                }
     # VoxCeleb2
     first = True
-    with open(input_dir / 'vox2_meta.csv', newline='') as file:
-        data = csv.reader(file, delimiter='\t')
+    with open(input_dir / "vox2_meta.csv", newline="") as file:
+        data = csv.reader(file, delimiter="\t")
         for row in data:
             if first:
                 first = False
             else:
-                speakers[row[1].strip()] = {'gender':   row[3].strip(),
-                                            'category': row[4].strip()}
+                speakers[row[1].strip()] = {
+                    "gender": row[3].strip(),
+                    "category": row[4].strip(),
+                }
     return speakers
@@ -46,18 +50,20 @@ def load_speakers(input_dir: Path) -> dict:
 def load_segment(path: str | os.PathLike[str]) -> Segment:
     path = Path(path)
-    with path.open('r') as file:
+    with path.open("r") as file:
         segment = file.read().strip()
-    header, frames = segment.split('\n\n')
+    header, frames = segment.split("\n\n")
     header_dict = _parse_header(header)
     start, stop = _get_segment_boundaries(frames)
-    return Segment(person=header_dict['Identity'],
-                   video=header_dict['Reference'],
-                   id=path.stem,
-                   start=start,
-                   stop=stop)
+    return Segment(
+        person=header_dict["Identity"],
+        video=header_dict["Reference"],
+        id=path.stem,
+        start=start,
+        stop=stop,
+    )
 def _parse_header(header: str) -> dict:
@@ -73,12 +79,12 @@ def _parse_header(header: str) -> dict:
         ASD Conf  : \t4.465
         """
-        k, v = line.split('\t', maxsplit=1)
+        k, v = line.split("\t", maxsplit=1)
         k = k[:-2].strip()
         v = v.strip()
         return k, v
-    return dict(_parse_line(line) for line in header.split('\n'))
+    return dict(_parse_line(line) for line in header.split("\n"))
 def _get_segment_boundaries(frames: str) -> tuple[float, float]:
@@ -94,9 +100,9 @@ def _get_segment_boundaries(frames: str) -> tuple[float, float]:
     """
     def _get_frame_seconds(line: str) -> float:
-        frame = int(line.split('\t')[0])
+        frame = int(line.split("\t")[0])
         # YouTube is 25 FPS
         return frame / 25
-    lines = frames.split('\n')
+    lines = frames.split("\n")
     return _get_frame_seconds(lines[1]), _get_frame_seconds(lines[-1])

sonusai 0.18.9__py3-none-any.whl → 0.19.5__py3-none-any.whl

sonusai 0.18.9__py3-none-any.whl → 0.19.5__py3-none-any.whl