phoonnx 0.2.0a1__tar.gz → 0.2.1a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/PKG-INFO +1 -1
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/mwl.py +1 -1
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/version.py +1 -1
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx.egg-info/PKG-INFO +1 -1
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx.egg-info/SOURCES.txt +1 -1
- phoonnx-0.2.1a1/phoonnx_train/preprocess.py +597 -0
- phoonnx-0.2.1a1/phoonnx_train/train.py +151 -0
- phoonnx-0.2.0a1/phoonnx_train/__main__.py +0 -151
- phoonnx-0.2.0a1/phoonnx_train/preprocess.py +0 -447
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/README.md +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/config.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/locale/ca/phonetic_spellings.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/locale/en/phonetic_spellings.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/locale/gl/phonetic_spellings.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/locale/pt/phonetic_spellings.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phoneme_ids.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/ar.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/base.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/en.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/fa.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/gl.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/he.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/ja.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/ko.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/mul.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/vi.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/phonemizers/zh.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/arpa2ipa.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/bw2ipa.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/hangul2ipa.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/aspiration.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/assimilation.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/double_coda.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/hanja.tsv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/ipa.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/neutralization.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/tensification.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/yale.csv +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/kog2p/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/kog2p/rulebook.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/symbols.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/tokenization.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/num2words.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/araby.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/named_const.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/normalize.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/number.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/number_const.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/stack.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/trans.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/unicode_symbol2label.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/phonikud/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/LICENSE +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/SOURCE +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/hint_id_map.json +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/input_id_map.json +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/target_id_map.json +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/thirdparty/zh_num.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/util.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx/voice.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx.egg-info/dependency_links.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx.egg-info/requires.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx.egg-info/top_level.txt +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/export_onnx.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/norm_audio/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/norm_audio/trim.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/norm_audio/vad.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/attentions.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/commons.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/config.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/dataset.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/lightning.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/losses.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/mel_processing.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/models.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/modules.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/monotonic_align/__init__.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/monotonic_align/setup.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/transforms.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/utils.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/phoonnx_train/vits/wavfile.py +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/setup.cfg +0 -0
- {phoonnx-0.2.0a1 → phoonnx-0.2.1a1}/setup.py +0 -0
@@ -1,5 +1,4 @@
|
|
1
1
|
from phoonnx.phonemizers.base import BasePhonemizer, Alphabet
|
2
|
-
from mwl_phonemizer import CRFOrthoCorrector
|
3
2
|
|
4
3
|
|
5
4
|
class MirandesePhonemizer(BasePhonemizer):
|
@@ -7,6 +6,7 @@ class MirandesePhonemizer(BasePhonemizer):
|
|
7
6
|
|
8
7
|
def __init__(self):
|
9
8
|
super().__init__(Alphabet.IPA)
|
9
|
+
from mwl_phonemizer import CRFOrthoCorrector
|
10
10
|
self.pho = CRFOrthoCorrector()
|
11
11
|
|
12
12
|
@classmethod
|
@@ -69,9 +69,9 @@ phoonnx/thirdparty/tashkeel/hint_id_map.json
|
|
69
69
|
phoonnx/thirdparty/tashkeel/input_id_map.json
|
70
70
|
phoonnx/thirdparty/tashkeel/model.onnx
|
71
71
|
phoonnx/thirdparty/tashkeel/target_id_map.json
|
72
|
-
phoonnx_train/__main__.py
|
73
72
|
phoonnx_train/export_onnx.py
|
74
73
|
phoonnx_train/preprocess.py
|
74
|
+
phoonnx_train/train.py
|
75
75
|
phoonnx_train/../phoonnx/__init__.py
|
76
76
|
phoonnx_train/../phoonnx/config.py
|
77
77
|
phoonnx_train/../phoonnx/phoneme_ids.py
|
@@ -0,0 +1,597 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
import csv
|
3
|
+
import dataclasses
|
4
|
+
import itertools
|
5
|
+
import json
|
6
|
+
import logging
|
7
|
+
import os
|
8
|
+
from collections import Counter
|
9
|
+
from dataclasses import dataclass
|
10
|
+
from multiprocessing import JoinableQueue, Process, Queue
|
11
|
+
from pathlib import Path
|
12
|
+
from typing import Dict, Iterable, List, Optional, Tuple, Any, Set, Union, Callable
|
13
|
+
|
14
|
+
import click
|
15
|
+
from phoonnx.util import normalize
|
16
|
+
from phoonnx.config import PhonemeType, get_phonemizer, Alphabet
|
17
|
+
from phoonnx.phonemizers import Phonemizer
|
18
|
+
from phoonnx.phoneme_ids import (
|
19
|
+
phonemes_to_ids, DEFAULT_IPA_PHONEME_ID_MAP, DEFAULT_PAD_TOKEN,
|
20
|
+
DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BLANK_WORD_TOKEN
|
21
|
+
)
|
22
|
+
from phoonnx_train.norm_audio import cache_norm_audio, make_silence_detector
|
23
|
+
from tqdm import tqdm
|
24
|
+
from phoonnx.version import VERSION_STR
|
25
|
+
|
26
|
+
_LOGGER = logging.getLogger("preprocess")
|
27
|
+
|
28
|
+
# Base phoneme map: the special tokens take the first consecutive ids,
# starting at 0, in the order listed here.
DEFAULT_SPECIAL_PHONEME_ID_MAP: Dict[str, int] = {
    token: token_id
    for token_id, token in enumerate(
        (
            DEFAULT_PAD_TOKEN,
            DEFAULT_BOS_TOKEN,
            DEFAULT_EOS_TOKEN,
            DEFAULT_BLANK_WORD_TOKEN,
        )
    )
}
|
35
|
+
|
36
|
+
# -----------------------------------------------------------------------------
|
37
|
+
|
38
|
+
@dataclass
class Utterance:
    """One dataset entry: its transcript, its audio file and derived data.

    Only ``text`` and ``audio_path`` are required; every other field
    defaults to ``None``.
    """
    # Transcript of the recording.
    text: str
    # Location of the source audio file.
    audio_path: Path
    # Speaker name and numeric speaker id, when known.
    speaker: Optional[str] = None
    speaker_id: Optional[int] = None
    # Phoneme sequence and the matching integer ids.
    phonemes: Optional[List[str]] = None
    phoneme_ids: Optional[List[int]] = None
    # Paths of the cached normalized audio and spectrogram files.
    audio_norm_path: Optional[Path] = None
    audio_spec_path: Optional[Path] = None

    def asdict(self) -> Dict[str, Any]:
        """Return the fields as a dict, with ``Path`` values turned into strings.

        Returns:
            A dictionary keyed by field name, in field declaration order.
        """
        return {
            name: str(value) if isinstance(value, Path) else value
            for name, value in dataclasses.asdict(self).items()
        }
|
57
|
+
|
58
|
+
|
59
|
+
class PathEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` that additionally knows how to encode ``Path``."""

    def default(self, o: Any) -> Union[str, Any]:
        """
        Serialize objects the stock encoder cannot handle.

        Args:
            o: The object the encoder could not serialize on its own.

        Returns:
            ``str(o)`` for ``Path`` instances; anything else is handed to
            the base class implementation.
        """
        return str(o) if isinstance(o, Path) else super().default(o)
|
75
|
+
|
76
|
+
|
77
|
+
def get_text_casing(casing: str) -> Callable[[str], str]:
    """
    Look up the string transformation for a casing name.

    Args:
        casing: One of 'lower', 'upper' or 'casefold'. Any other value
            (including 'ignore') selects the identity transformation.

    Returns:
        A callable taking a string and returning the re-cased string.
    """
    transforms = {
        "lower": str.lower,
        "upper": str.upper,
        "casefold": str.casefold,
    }
    # Unknown names fall back to leaving the text untouched.
    return transforms.get(casing, lambda s: s)
|
94
|
+
|
95
|
+
|
96
|
+
@dataclass
class PreprocessorConfig:
    """Dataclass to hold all runtime configuration, mimicking argparse.Namespace."""
    # Dataset directory; metadata.csv and the wav/ or wavs/ folder are looked up inside it.
    input_dir: Path
    # Directory that receives config.json and dataset.jsonl.
    output_dir: Path
    # Language code handed to text normalization and to the phonemizer.
    language: str
    # Target sample rate in hertz, used for audio caching and written to config.json.
    sample_rate: int
    # Directory for cached, processed audio files.
    cache_dir: Path
    # Number of worker processes to start.
    max_workers: int
    # When true, speaker columns in the metadata are ignored.
    single_speaker: bool
    # Fixed speaker id copied onto every loaded utterance (may be None).
    speaker_id: Optional[int]
    # Phonemizer selection: phoneme type, alphabet and optional model.
    phoneme_type: PhonemeType
    alphabet: Alphabet
    phonemizer_model: str
    # Casing name applied to text before phonemization ('ignore', 'lower', 'upper', 'casefold').
    text_casing: str
    # Optional overrides for the dataset name and quality label in config.json.
    dataset_name: Optional[str]
    audio_quality: Optional[str]
    # When true, audio files are neither required nor processed.
    skip_audio: bool
    # When true, logging runs at DEBUG level.
    debug: bool
    # When true, the phonemizer adds diacritics to the text before phonemizing.
    add_diacritics: bool
|
116
|
+
|
117
|
+
|
118
|
+
def ljspeech_dataset(config: PreprocessorConfig) -> Iterable[Utterance]:
    """
    Generator for LJSpeech-style dataset.

    Reads ``<input_dir>/metadata.csv`` (``|``-delimited) and, for every row,
    looks for the matching audio file in ``<input_dir>/wav`` and
    ``<input_dir>/wavs``. The first column is the file name, the last column
    is the text; the second column is used as the speaker when the row has
    more than two columns and ``config.single_speaker`` is not set.

    Rows are skipped (with a warning) when they have fewer than two columns
    or, unless ``config.skip_audio`` is set, when the audio file is missing
    or empty. If the metadata file does not exist an error is logged and
    nothing is yielded.

    Args:
        config: The configuration object containing dataset parameters.

    Yields:
        Utterance: One utterance per usable metadata row.
    """
    dataset_dir = config.input_dir
    metadata_path = dataset_dir / "metadata.csv"
    if not metadata_path.exists():
        _LOGGER.error("Missing metadata file: %s", metadata_path)
        return

    wav_dirs: List[Path] = [dataset_dir / "wav", dataset_dir / "wavs"]

    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(metadata_path, "r", encoding="utf-8", newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter="|")
        for row in reader:
            if len(row) < 2:
                _LOGGER.warning("Skipping malformed row: %s", row)
                continue

            filename: str = row[0]
            text: str = row[-1]
            speaker: Optional[str] = (
                row[1] if not config.single_speaker and len(row) > 2 else None
            )

            # Candidate locations, in priority order: the name as given, the
            # name with ".wav" appended, and the name without leading zeros.
            candidates = (
                candidate
                for wav_dir in wav_dirs
                for candidate in (
                    wav_dir / filename,
                    wav_dir / f"{filename}.wav",
                    wav_dir / f"{filename.lstrip('0')}.wav",
                )
            )
            # is_file() rather than exists(): an empty file name resolves to
            # the wav directory itself, which must not count as audio.
            wav_path: Optional[Path] = next(
                (candidate for candidate in candidates if candidate.is_file()),
                None,
            )

            if not config.skip_audio and not wav_path:
                _LOGGER.warning("Missing audio file for filename: %s", filename)
                continue

            if not config.skip_audio and wav_path and wav_path.stat().st_size == 0:
                _LOGGER.warning("Empty audio file: %s", wav_path)
                continue

            yield Utterance(
                text=text,
                # Placeholder path when audio is skipped; it should not be used.
                audio_path=wav_path or Path(""),
                speaker=speaker,
                speaker_id=config.speaker_id,
            )
|
182
|
+
|
183
|
+
|
184
|
+
def phonemize_worker(
    config: PreprocessorConfig,
    task_queue: JoinableQueue,
    result_queue: Queue,
    phonemizer: Phonemizer,
) -> None:
    """
    Worker loop: phonemize utterances and, unless disabled, cache their audio.

    Batches are read from ``task_queue`` until a ``None`` stop marker arrives.
    For every utterance exactly one item is put on ``result_queue``:
    ``(utterance, set_of_its_phonemes)`` on success or ``(None, set())`` when
    processing that utterance raised. Each batch, and the stop marker, is
    acknowledged with ``task_done()``.

    Args:
        config: The configuration object containing runtime parameters.
        task_queue: Queue delivering lists of Utterance objects.
        result_queue: Queue receiving the per-utterance results.
        phonemizer: The initialized Phonemizer instance.
    """
    try:
        apply_casing: Callable[[str], str] = get_text_casing(config.text_casing)
        detector = make_silence_detector()

        # iter(callable, sentinel) keeps fetching batches until None shows up.
        for batch in iter(task_queue.get, None):
            for item in batch:
                try:
                    # Normalize first, then apply the configured casing.
                    text: str = apply_casing(normalize(item.text, config.language))

                    if config.add_diacritics:
                        text = phonemizer.add_diacritics(text, config.language)

                    item.phonemes = phonemizer.phonemize_to_list(text, config.language)
                    if not item.phonemes:
                        raise RuntimeError(f"Phonemes not found for '{text}'")

                    if not config.skip_audio:
                        item.audio_norm_path, item.audio_spec_path = cache_norm_audio(
                            item.audio_path,
                            config.cache_dir,
                            detector,
                            config.sample_rate,
                        )

                    # Report the utterance together with its distinct phonemes.
                    result_queue.put((item, set(item.phonemes)))
                except Exception:
                    _LOGGER.exception("Failed to process utterance: %s", item.audio_path)
                    # Still emit a result so the consumer's count stays in step.
                    result_queue.put((None, set()))

            task_queue.task_done()

        # Acknowledge the stop marker itself.
        task_queue.task_done()

    except Exception:
        _LOGGER.exception("Worker process failed")
|
245
|
+
|
246
|
+
|
247
|
+
def _next_result(result_queue, workers, poll_seconds: float = 1.0):
    """
    Fetch the next worker result without risking an endless wait.

    Args:
        result_queue: Queue the workers put their result tuples on.
        workers: The worker processes feeding ``result_queue``.
        poll_seconds: How long a single wait on the queue may last.

    Returns:
        The next queued result, or ``None`` when every worker has exited and
        the queue stayed empty.
    """
    # Imported locally so this helper needs nothing new at module level;
    # multiprocessing queues signal a timeout with queue.Empty.
    from queue import Empty

    while True:
        try:
            return result_queue.get(timeout=poll_seconds)
        except Empty:
            if any(worker.is_alive() for worker in workers):
                continue
            # A worker may have queued its last result right before exiting,
            # so look once more before concluding nothing else will arrive.
            try:
                return result_queue.get(timeout=poll_seconds)
            except Empty:
                return None


@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
    "-i",
    "--input-dir",
    "input_dir",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    required=True,
    help="Directory with audio dataset (e.g., containing metadata.csv and wavs/)",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=click.Path(file_okay=False, path_type=Path),
    required=True,
    help="Directory to write output files for training (config.json, dataset.jsonl)",
)
@click.option(
    "-l",
    "--language",
    "language",
    required=True,
    help="phonemizer language code (e.g., 'en', 'es', 'fr')",
)
@click.option(
    "-r",
    "--sample-rate",
    "sample_rate",
    type=int,
    required=True,
    help="Target sample rate for voice (hertz, e.g., 22050)",
)
@click.option(
    "--cache-dir",
    "cache_dir",
    type=click.Path(file_okay=False, path_type=Path),
    default=None,
    help="Directory to cache processed audio files. Defaults to <output-dir>/cache/<sample-rate>.",
)
@click.option(
    "-w",
    "--max-workers",
    "max_workers",
    type=click.IntRange(min=1),
    default=os.cpu_count() or 1,
    help="Maximum number of worker processes to use for parallel processing. Defaults to CPU count.",
)
@click.option(
    "--single-speaker",
    "single_speaker",
    is_flag=True,
    help="Force treating the dataset as single speaker, ignoring metadata speaker columns.",
)
@click.option(
    "--speaker-id",
    "speaker_id",
    type=int,
    default=None,
    help="Specify a fixed speaker ID (0, 1, etc.) for a single speaker dataset.",
)
@click.option(
    "--phoneme-type",
    "phoneme_type",
    type=click.Choice([p.value for p in PhonemeType]),
    default=PhonemeType.ESPEAK.value,
    help="Type of phonemes to use.",
)
@click.option(
    "--alphabet",
    "alphabet",
    type=click.Choice([a.value for a in Alphabet]),
    default=Alphabet.IPA.value,
    help="Phoneme alphabet to use (e.g., IPA).",
)
@click.option(
    "--phonemizer-model",
    "phonemizer_model",
    default="",
    help="Path or name of a custom phonemizer model, if applicable.",
)
@click.option(
    "--text-casing",
    "text_casing",
    type=click.Choice(("ignore", "lower", "upper", "casefold")),
    default="ignore",
    help="Casing applied to utterance text before phonemization.",
)
@click.option(
    "--dataset-name",
    "dataset_name",
    default=None,
    help="Name of dataset to put in config (default: name of <output_dir>/../).",
)
@click.option(
    "--audio-quality",
    "audio_quality",
    default=None,
    help="Audio quality description to put in config (default: name of <output_dir>).",
)
@click.option(
    "--skip-audio",
    "skip_audio",
    is_flag=True,
    help="Do not preprocess or cache audio files.",
)
@click.option(
    "--debug",
    "debug",
    is_flag=True,
    help="Print DEBUG messages to the console.",
)
@click.option(
    "--add-diacritics",
    "add_diacritics",
    is_flag=True,
    help="Add diacritics to text (phonemizer specific, e.g., to denote stress).",
)
def cli(
    input_dir: Path,
    output_dir: Path,
    language: str,
    sample_rate: int,
    cache_dir: Optional[Path],
    max_workers: Optional[int],
    single_speaker: bool,
    speaker_id: Optional[int],
    phoneme_type: str,
    alphabet: str,
    phonemizer_model: str,
    text_casing: str,
    dataset_name: Optional[str],
    audio_quality: Optional[str],
    skip_audio: bool,
    debug: bool,
    add_diacritics: bool,
) -> None:
    """
    Preprocess a TTS dataset (e.g., LJSpeech format) for training a VITS-style model.
    This script handles text normalization, phonemization, and optional audio caching.
    """
    # NOTE: the docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Steps: load utterances -> assign speaker ids -> phonemize (and cache
    # audio) in worker processes -> build the phoneme id map -> write
    # config.json and dataset.jsonl into the output directory.
    #
    # Raises click.Abort when the options conflict or when the worker
    # processes exit before delivering every result.

    # Create a config object from click arguments for easier passing
    config = PreprocessorConfig(
        input_dir=input_dir,
        output_dir=output_dir,
        language=language,
        sample_rate=sample_rate,
        cache_dir=cache_dir or output_dir / "cache" / str(sample_rate),
        max_workers=max_workers or os.cpu_count() or 1,
        single_speaker=single_speaker,
        speaker_id=speaker_id,
        phoneme_type=PhonemeType(phoneme_type),
        alphabet=Alphabet(alphabet),
        phonemizer_model=phonemizer_model,
        text_casing=text_casing,
        dataset_name=dataset_name,
        audio_quality=audio_quality,
        skip_audio=skip_audio,
        debug=debug,
        add_diacritics=add_diacritics,
    )

    # Setup logging
    level = logging.DEBUG if config.debug else logging.INFO
    logging.basicConfig(level=level)
    logging.getLogger().setLevel(level)
    logging.getLogger("numba").setLevel(logging.WARNING)

    # Validation
    if config.single_speaker and (config.speaker_id is not None):
        _LOGGER.fatal("--single-speaker and --speaker-id cannot both be provided")
        raise click.Abort()

    # Create directories
    config.output_dir.mkdir(parents=True, exist_ok=True)
    config.cache_dir.mkdir(parents=True, exist_ok=True)

    # Load all utterances from the dataset
    _LOGGER.info("Loading utterances from dataset...")
    utterances: List[Utterance] = list(ljspeech_dataset(config))
    if not utterances:
        _LOGGER.error("No valid utterances found in dataset.")
        return

    num_utterances: int = len(utterances)
    _LOGGER.info("Found %d utterances.", num_utterances)

    # Count speakers and assign IDs
    speaker_counts: Counter[str] = Counter(u.speaker for u in utterances if u.speaker)
    is_multispeaker: bool = len(speaker_counts) > 1
    speaker_ids: Dict[str, int] = {}
    if is_multispeaker:
        _LOGGER.info("%s speakers detected", len(speaker_counts))
        # Assign speaker ids by most number of utterances first
        for speaker_id, (speaker, _) in enumerate(speaker_counts.most_common()):
            speaker_ids[speaker] = speaker_id
    else:
        _LOGGER.info("Single speaker dataset")

    # --- Single Pass: Process audio/phonemes and collect results ---
    _LOGGER.info("Starting single pass processing with %d workers...", config.max_workers)

    # Initialize the phonemizer only once in the main process
    phonemizer: Phonemizer = get_phonemizer(config.phoneme_type,
                                            config.alphabet,
                                            config.phonemizer_model)

    # Aim for roughly two batches per worker, never less than one utterance.
    batch_size: int = max(1, int(num_utterances / (config.max_workers * 2)))

    task_queue: "JoinableQueue[Optional[List[Utterance]]]" = JoinableQueue()
    # The result queue will hold tuples of (Utterance, set(phonemes))
    result_queue: "Queue[Tuple[Optional[Utterance], Set[str]]]" = Queue()

    # Start workers
    processes: List[Process] = [
        Process(
            target=phonemize_worker,
            args=(config, task_queue, result_queue, phonemizer)
        )
        for _ in range(config.max_workers)
    ]

    for proc in processes:
        proc.start()

    # Populate the task queue with batches
    task_count: int = 0
    for utt_batch in batched(utterances, batch_size):
        task_queue.put(utt_batch)
        task_count += len(utt_batch)

    # Signal workers to stop
    for _ in range(config.max_workers):
        task_queue.put(None)

    # Collect results from the queue with a progress bar. Workers emit exactly
    # one result per utterance, so fewer results can only mean a worker died;
    # polling with a liveness check turns that into an error instead of a hang.
    processed_utterances: List[Utterance] = []
    all_phonemes: Set[str] = set()
    for received in tqdm(range(task_count), desc="Processing utterances"):
        result = _next_result(result_queue, processes)
        if result is None:
            _LOGGER.fatal(
                "Worker processes exited after %d of %d results; aborting",
                received,
                task_count,
            )
            # Batches nobody will consume may still be buffered; do not wait
            # for them to be flushed when the interpreter shuts down.
            task_queue.cancel_join_thread()
            raise click.Abort()
        utt, unique_phonemes = result
        if utt is not None:
            processed_utterances.append(utt)
            all_phonemes.update(unique_phonemes)

    # Wait for workers to finish. Joining the processes (rather than the task
    # queue) cannot block on a stop marker that a failed worker never
    # acknowledged.
    for proc in processes:
        proc.join()

    # --- Build the final phoneme map from the collected phonemes ---
    _LOGGER.info("Building a complete phoneme map from collected phonemes...")

    final_phoneme_id_map: Dict[str, int] = DEFAULT_SPECIAL_PHONEME_ID_MAP.copy()
    if phonemizer.alphabet == Alphabet.IPA:
        all_phonemes.update(DEFAULT_IPA_PHONEME_ID_MAP.keys())

    # Filter out special tokens that are already in the map
    existing_keys: Set[str] = set(final_phoneme_id_map.keys())
    new_phonemes: List[str] = sorted([p for p in all_phonemes if p not in existing_keys])

    # New phonemes get consecutive ids following the special tokens, in sorted order.
    current_id: int = len(final_phoneme_id_map)
    for pho in new_phonemes:
        final_phoneme_id_map[pho] = current_id
        current_id += 1

    _LOGGER.info("Final phoneme map contains %d symbols.", len(final_phoneme_id_map))

    # --- Write the final config.json ---
    _LOGGER.info("Writing dataset config...")
    audio_quality = config.audio_quality or config.output_dir.name
    dataset_name = config.dataset_name or config.output_dir.parent.name

    config_data: Dict[str, Any] = {
        "dataset": dataset_name,
        "audio": {
            "sample_rate": config.sample_rate,
            "quality": audio_quality,
        },
        "lang_code": config.language,
        "inference": {"noise_scale": 0.667,
                      "length_scale": 1,
                      "noise_w": 0.8,
                      "add_diacritics": config.add_diacritics},
        "alphabet": phonemizer.alphabet.value,
        "phoneme_type": config.phoneme_type.value,
        "phonemizer_model": config.phonemizer_model,
        "phoneme_id_map": final_phoneme_id_map,
        "num_symbols": len(final_phoneme_id_map),
        "num_speakers": len(speaker_counts) if is_multispeaker else 1,
        "speaker_id_map": speaker_ids,
        "phoonnx_version": VERSION_STR,
    }

    with open(config.output_dir / "config.json", "w", encoding="utf-8") as config_file:
        json.dump(config_data, config_file, ensure_ascii=False, indent=2)

    # --- Apply final phoneme IDs and write dataset.jsonl ---
    _LOGGER.info("Writing dataset.jsonl...")
    valid_utterances_count: int = 0
    with open(config.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
        for utt in processed_utterances:
            if is_multispeaker and utt.speaker is not None:
                if utt.speaker not in speaker_ids:
                    _LOGGER.error("Speaker '%s' not in speaker_id_map. This indicates an issue with your metadata.csv file.", utt.speaker)
                    continue
                utt.speaker_id = speaker_ids[utt.speaker]

            # Apply the final phoneme ID map to each utterance
            if utt.phonemes:
                utt.phoneme_ids = phonemes_to_ids(utt.phonemes, id_map=final_phoneme_id_map)

            if not utt.phoneme_ids:
                _LOGGER.warning("Skipping utterance with invalid phoneme_ids before writing: %s", utt.audio_path)
                continue

            json.dump(
                utt.asdict(),
                dataset_file,
                ensure_ascii=False,
                cls=PathEncoder,
            )
            # One JSON object per line.
            print("", file=dataset_file)
            valid_utterances_count += 1

    _LOGGER.info("Preprocessing complete. Wrote %d valid utterances to dataset.jsonl.", valid_utterances_count)
|
572
|
+
|
573
|
+
|
574
|
+
# -----------------------------------------------------------------------------
|
575
|
+
|
576
|
+
def batched(iterable: Iterable[Any], n: int) -> Iterable[List[Any]]:
    """
    Split an iterable into consecutive lists of at most ``n`` items.

    Every yielded list holds exactly ``n`` items except possibly the last
    one, which holds whatever is left. An empty input yields nothing.

    Args:
        iterable: The input iterable to be batched.
        n: The desired size of each batch.

    Yields:
        List[Any]: The next batch of items, in input order.

    Raises:
        ValueError: If ``n`` is smaller than one (raised on first iteration).
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    # iter(callable, sentinel): keep taking slices of n items until a slice
    # comes back empty, which means the source is exhausted.
    yield from iter(lambda: list(itertools.islice(source, n)), [])
|
594
|
+
|
595
|
+
|
596
|
+
# Run the click command when the module is executed as a script.
if __name__ == "__main__":
    cli()
|