phoonnx 0.2.0a2__tar.gz → 0.2.1a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/PKG-INFO +1 -1
  2. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/version.py +2 -2
  3. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/PKG-INFO +1 -1
  4. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/SOURCES.txt +1 -1
  5. phoonnx-0.2.1a1/phoonnx_train/preprocess.py +597 -0
  6. phoonnx-0.2.1a1/phoonnx_train/train.py +151 -0
  7. phoonnx-0.2.0a2/phoonnx_train/__main__.py +0 -151
  8. phoonnx-0.2.0a2/phoonnx_train/preprocess.py +0 -447
  9. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/README.md +0 -0
  10. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/__init__.py +0 -0
  11. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/config.py +0 -0
  12. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/locale/ca/phonetic_spellings.txt +0 -0
  13. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/locale/en/phonetic_spellings.txt +0 -0
  14. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/locale/gl/phonetic_spellings.txt +0 -0
  15. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/locale/pt/phonetic_spellings.txt +0 -0
  16. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phoneme_ids.py +0 -0
  17. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/__init__.py +0 -0
  18. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/ar.py +0 -0
  19. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/base.py +0 -0
  20. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/en.py +0 -0
  21. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/fa.py +0 -0
  22. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/gl.py +0 -0
  23. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/he.py +0 -0
  24. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/ja.py +0 -0
  25. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/ko.py +0 -0
  26. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/mul.py +0 -0
  27. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/mwl.py +0 -0
  28. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/vi.py +0 -0
  29. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/phonemizers/zh.py +0 -0
  30. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/__init__.py +0 -0
  31. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/arpa2ipa.py +0 -0
  32. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/bw2ipa.py +0 -0
  33. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  34. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  35. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/hangul2ipa.py +0 -0
  36. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/aspiration.csv +0 -0
  37. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/assimilation.csv +0 -0
  38. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/double_coda.csv +0 -0
  39. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/hanja.tsv +0 -0
  40. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/ipa.csv +0 -0
  41. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/neutralization.csv +0 -0
  42. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/tensification.csv +0 -0
  43. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/ko_tables/yale.csv +0 -0
  44. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/kog2p/__init__.py +0 -0
  45. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/kog2p/rulebook.txt +0 -0
  46. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/__init__.py +0 -0
  47. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  48. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +0 -0
  49. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/symbols.py +0 -0
  50. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/buck/tokenization.py +0 -0
  51. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/num2words.py +0 -0
  52. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/__init__.py +0 -0
  53. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +0 -0
  54. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/araby.py +0 -0
  55. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/named_const.py +0 -0
  56. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/normalize.py +0 -0
  57. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/number.py +0 -0
  58. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/number_const.py +0 -0
  59. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/stack.py +0 -0
  60. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/pyarabic/trans.py +0 -0
  61. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/mantoq/unicode_symbol2label.py +0 -0
  62. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/phonikud/__init__.py +0 -0
  63. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/LICENSE +0 -0
  64. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/SOURCE +0 -0
  65. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/__init__.py +0 -0
  66. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/hint_id_map.json +0 -0
  67. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/input_id_map.json +0 -0
  68. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  69. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/tashkeel/target_id_map.json +0 -0
  70. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/thirdparty/zh_num.py +0 -0
  71. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/util.py +0 -0
  72. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/voice.py +0 -0
  73. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/dependency_links.txt +0 -0
  74. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/requires.txt +0 -0
  75. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/top_level.txt +0 -0
  76. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/export_onnx.py +0 -0
  77. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/norm_audio/__init__.py +0 -0
  78. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/norm_audio/trim.py +0 -0
  79. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/norm_audio/vad.py +0 -0
  80. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/__init__.py +0 -0
  81. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/attentions.py +0 -0
  82. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/commons.py +0 -0
  83. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/config.py +0 -0
  84. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/dataset.py +0 -0
  85. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/lightning.py +0 -0
  86. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/losses.py +0 -0
  87. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/mel_processing.py +0 -0
  88. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/models.py +0 -0
  89. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/modules.py +0 -0
  90. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/monotonic_align/__init__.py +0 -0
  91. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/monotonic_align/setup.py +0 -0
  92. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/transforms.py +0 -0
  93. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/utils.py +0 -0
  94. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx_train/vits/wavfile.py +0 -0
  95. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/setup.cfg +0 -0
  96. {phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/setup.py +0 -0
{phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: phoonnx
- Version: 0.2.0a2
+ Version: 0.2.1a1
  Home-page: https://github.com/TigreGotico/phoonnx
  Author: JarbasAi
  Author-email: jarbasai@mailfence.com
{phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx/version.py
@@ -1,8 +1,8 @@
  # START_VERSION_BLOCK
  VERSION_MAJOR = 0
  VERSION_MINOR = 2
- VERSION_BUILD = 0
- VERSION_ALPHA = 2
+ VERSION_BUILD = 1
+ VERSION_ALPHA = 1
  # END_VERSION_BLOCK

  VERSION_STR = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}"
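For context, a minimal sketch (not from the package) of how these fields most likely combine into the "0.2.1a1" release string; the alpha-suffix handling is an assumption, since it falls outside the visible hunk:

    # Hypothetical reconstruction; only VERSION_STR appears in the hunk above.
    VERSION_MAJOR, VERSION_MINOR, VERSION_BUILD, VERSION_ALPHA = 0, 2, 1, 1

    VERSION_STR = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}"              # "0.2.1"
    release = f"{VERSION_STR}a{VERSION_ALPHA}" if VERSION_ALPHA else VERSION_STR  # "0.2.1a1"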
{phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: phoonnx
- Version: 0.2.0a2
+ Version: 0.2.1a1
  Home-page: https://github.com/TigreGotico/phoonnx
  Author: JarbasAi
  Author-email: jarbasai@mailfence.com
{phoonnx-0.2.0a2 → phoonnx-0.2.1a1}/phoonnx.egg-info/SOURCES.txt
@@ -69,9 +69,9 @@ phoonnx/thirdparty/tashkeel/hint_id_map.json
  phoonnx/thirdparty/tashkeel/input_id_map.json
  phoonnx/thirdparty/tashkeel/model.onnx
  phoonnx/thirdparty/tashkeel/target_id_map.json
- phoonnx_train/__main__.py
  phoonnx_train/export_onnx.py
  phoonnx_train/preprocess.py
+ phoonnx_train/train.py
  phoonnx_train/../phoonnx/__init__.py
  phoonnx_train/../phoonnx/config.py
  phoonnx_train/../phoonnx/phoneme_ids.py
phoonnx-0.2.1a1/phoonnx_train/preprocess.py
@@ -0,0 +1,597 @@
+ #!/usr/bin/env python3
+ import csv
+ import dataclasses
+ import itertools
+ import json
+ import logging
+ import os
+ from collections import Counter
+ from dataclasses import dataclass
+ from multiprocessing import JoinableQueue, Process, Queue
+ from pathlib import Path
+ from typing import Dict, Iterable, List, Optional, Tuple, Any, Set, Union, Callable
+
+ import click
+ from phoonnx.util import normalize
+ from phoonnx.config import PhonemeType, get_phonemizer, Alphabet
+ from phoonnx.phonemizers import Phonemizer
+ from phoonnx.phoneme_ids import (
+     phonemes_to_ids, DEFAULT_IPA_PHONEME_ID_MAP, DEFAULT_PAD_TOKEN,
+     DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BLANK_WORD_TOKEN
+ )
+ from phoonnx_train.norm_audio import cache_norm_audio, make_silence_detector
+ from tqdm import tqdm
+ from phoonnx.version import VERSION_STR
+
+ _LOGGER = logging.getLogger("preprocess")
+
+ # Base phoneme map
+ DEFAULT_SPECIAL_PHONEME_ID_MAP: Dict[str, int] = {
+     DEFAULT_PAD_TOKEN: 0,
+     DEFAULT_BOS_TOKEN: 1,
+     DEFAULT_EOS_TOKEN: 2,
+     DEFAULT_BLANK_WORD_TOKEN: 3,
+ }
+
+ # -----------------------------------------------------------------------------
+
+ @dataclass
+ class Utterance:
+     """Represents a single utterance in the dataset."""
+     text: str
+     audio_path: Path
+     speaker: Optional[str] = None
+     speaker_id: Optional[int] = None
+     phonemes: Optional[List[str]] = None
+     phoneme_ids: Optional[List[int]] = None
+     audio_norm_path: Optional[Path] = None
+     audio_spec_path: Optional[Path] = None
+
+     def asdict(self) -> Dict[str, Any]:
+         """Custom asdict to handle Path objects for JSON serialization."""
+         data = dataclasses.asdict(self)
+         for key, value in data.items():
+             if isinstance(value, Path):
+                 data[key] = str(value)
+         return data
+
+
+ class PathEncoder(json.JSONEncoder):
+     """JSON encoder for Path objects."""
+
+     def default(self, o: Any) -> Union[str, Any]:
+         """
+         Converts Path objects to strings for serialization.
+
+         Args:
+             o: The object to serialize.
+
+         Returns:
+             The serialized string representation or the default JSON serialization.
+         """
+         if isinstance(o, Path):
+             return str(o)
+         return super().default(o)
+
+
+ def get_text_casing(casing: str) -> Callable[[str], str]:
+     """
+     Returns a function to apply text casing based on a string name.
+
+     Args:
+         casing: The name of the casing function ('lower', 'upper', 'casefold', or 'ignore').
+
+     Returns:
+         A callable function (str) -> str.
+     """
+     if casing == "lower":
+         return str.lower
+     if casing == "upper":
+         return str.upper
+     if casing == "casefold":
+         return str.casefold
+     return lambda s: s
+
+
+ @dataclass
+ class PreprocessorConfig:
+     """Dataclass to hold all runtime configuration, mimicking argparse.Namespace."""
+     input_dir: Path
+     output_dir: Path
+     language: str
+     sample_rate: int
+     cache_dir: Path
+     max_workers: int
+     single_speaker: bool
+     speaker_id: Optional[int]
+     phoneme_type: PhonemeType
+     alphabet: Alphabet
+     phonemizer_model: str
+     text_casing: str
+     dataset_name: Optional[str]
+     audio_quality: Optional[str]
+     skip_audio: bool
+     debug: bool
+     add_diacritics: bool
+
+
+ def ljspeech_dataset(config: PreprocessorConfig) -> Iterable[Utterance]:
+     """
+     Generator for LJSpeech-style dataset.
+     Loads metadata and resolves audio file paths.
+
+     Args:
+         config: The configuration object containing dataset parameters.
+
+     Yields:
+         Utterance: A fully populated Utterance object.
+     """
+     dataset_dir = config.input_dir
+     metadata_path = dataset_dir / "metadata.csv"
+     if not metadata_path.exists():
+         _LOGGER.error(f"Missing metadata file: {metadata_path}")
+         return
+
+     wav_dirs: List[Path] = [dataset_dir / "wav", dataset_dir / "wavs"]
+
+     with open(metadata_path, "r", encoding="utf-8") as csv_file:
+         reader = csv.reader(csv_file, delimiter="|")
+         for row in reader:
+             if len(row) < 2:
+                 _LOGGER.warning(f"Skipping malformed row: {row}")
+                 continue
+
+             filename: str = row[0]
+             text: str = row[-1]
+             speaker: Optional[str] = None
+
+             if not config.single_speaker and len(row) > 2:
+                 speaker = row[1]
+             else:
+                 speaker = None
+
+             wav_path: Optional[Path] = None
+             for wav_dir in wav_dirs:
+                 potential_paths: List[Path] = [
+                     wav_dir / filename,
+                     wav_dir / f"{filename}.wav",
+                     wav_dir / f"{filename.lstrip('0')}.wav"
+                 ]
+                 for path in potential_paths:
+                     if path.exists():
+                         wav_path = path
+                         break
+                 if wav_path:
+                     break
+
+             if not config.skip_audio and not wav_path:
+                 _LOGGER.warning("Missing audio file for filename: %s", filename)
+                 continue
+
+             if not config.skip_audio and wav_path and wav_path.stat().st_size == 0:
+                 _LOGGER.warning("Empty audio file: %s", wav_path)
+                 continue
+
+             # Ensure wav_path is Path or None, and is never accessed if skip_audio is true
+             yield Utterance(
+                 text=text,
+                 audio_path=wav_path or Path(""),  # Use empty path if skipping audio, should not be used
+                 speaker=speaker,
+                 speaker_id=config.speaker_id,
+             )
+
+
+ def phonemize_worker(
+     config: PreprocessorConfig,
+     task_queue: JoinableQueue,
+     result_queue: Queue,
+     phonemizer: Phonemizer,
+ ) -> None:
+     """
+     Worker process for phonemization and audio processing.
+
+     Args:
+         config: The configuration object containing runtime parameters.
+         task_queue: Queue for receiving batches of Utterance objects.
+         result_queue: Queue for sending processed results (Utterance, set of phonemes).
+         phonemizer: The initialized Phonemizer instance.
+     """
+     try:
+         casing: Callable[[str], str] = get_text_casing(config.text_casing)
+         silence_detector = make_silence_detector()
+
+         while True:
+             # Get a batch of utterances to process
+             utterance_batch: Union[List[Utterance], None] = task_queue.get()
+             if utterance_batch is None:
+                 # Signal to exit
+                 task_queue.task_done()
+                 break
+
+             for utt in utterance_batch:
+                 try:
+                     # Normalize text (case, numbers, etc.)
+                     utterance: str = casing(normalize(utt.text, config.language))
+
+                     # Add diacritics
+                     if config.add_diacritics:
+                         utterance = phonemizer.add_diacritics(utterance, config.language)
+
+                     # Phonemize the text
+                     utt.phonemes = phonemizer.phonemize_to_list(utterance, config.language)
+                     if not utt.phonemes:
+                         raise RuntimeError(f"Phonemes not found for '{utterance}'")
+
+                     # Process audio if not skipping
+                     if not config.skip_audio:
+                         utt.audio_norm_path, utt.audio_spec_path = cache_norm_audio(
+                             utt.audio_path,
+                             config.cache_dir,
+                             silence_detector,
+                             config.sample_rate,
+                         )
+
+                     # Put the processed utterance and its phonemes into the result queue
+                     # The result is a tuple of (Utterance, set of unique phonemes in that utterance)
+                     result_queue.put((utt, set(utt.phonemes)))
+                 except Exception:
+                     _LOGGER.exception("Failed to process utterance: %s", utt.audio_path)
+                     result_queue.put((None, set()))
+
+             task_queue.task_done()
+
+     except Exception:
+         _LOGGER.exception("Worker process failed")
+
+
+ @click.command(context_settings={"help_option_names": ["-h", "--help"]})
+ @click.option(
+     "-i",
+     "--input-dir",
+     "input_dir",
+     type=click.Path(exists=True, file_okay=False, path_type=Path),
+     required=True,
+     help="Directory with audio dataset (e.g., containing metadata.csv and wavs/)",
+ )
+ @click.option(
+     "-o",
+     "--output-dir",
+     "output_dir",
+     type=click.Path(file_okay=False, path_type=Path),
+     required=True,
+     help="Directory to write output files for training (config.json, dataset.jsonl)",
+ )
+ @click.option(
+     "-l",
+     "--language",
+     "language",
+     required=True,
+     help="phonemizer language code (e.g., 'en', 'es', 'fr')",
+ )
+ @click.option(
+     "-r",
+     "--sample-rate",
+     "sample_rate",
+     type=int,
+     required=True,
+     help="Target sample rate for voice (hertz, e.g., 22050)",
+ )
+ @click.option(
+     "--cache-dir",
+     "cache_dir",
+     type=click.Path(file_okay=False, path_type=Path),
+     default=None,
+     help="Directory to cache processed audio files. Defaults to <output-dir>/cache/<sample-rate>.",
+ )
+ @click.option(
+     "-w",
+     "--max-workers",
+     "max_workers",
+     type=click.IntRange(min=1),
+     default=os.cpu_count() or 1,
+     help="Maximum number of worker processes to use for parallel processing. Defaults to CPU count.",
+ )
+ @click.option(
+     "--single-speaker",
+     "single_speaker",
+     is_flag=True,
+     help="Force treating the dataset as single speaker, ignoring metadata speaker columns.",
+ )
+ @click.option(
+     "--speaker-id",
+     "speaker_id",
+     type=int,
+     default=None,
+     help="Specify a fixed speaker ID (0, 1, etc.) for a single speaker dataset.",
+ )
+ @click.option(
+     "--phoneme-type",
+     "phoneme_type",
+     type=click.Choice([p.value for p in PhonemeType]),
+     default=PhonemeType.ESPEAK.value,
+     help="Type of phonemes to use.",
+ )
+ @click.option(
+     "--alphabet",
+     "alphabet",
+     type=click.Choice([a.value for a in Alphabet]),
+     default=Alphabet.IPA.value,
+     help="Phoneme alphabet to use (e.g., IPA).",
+ )
+ @click.option(
+     "--phonemizer-model",
+     "phonemizer_model",
+     default="",
+     help="Path or name of a custom phonemizer model, if applicable.",
+ )
+ @click.option(
+     "--text-casing",
+     "text_casing",
+     type=click.Choice(("ignore", "lower", "upper", "casefold")),
+     default="ignore",
+     help="Casing applied to utterance text before phonemization.",
+ )
+ @click.option(
+     "--dataset-name",
+     "dataset_name",
+     default=None,
+     help="Name of dataset to put in config (default: name of <output_dir>/../).",
+ )
+ @click.option(
+     "--audio-quality",
+     "audio_quality",
+     default=None,
+     help="Audio quality description to put in config (default: name of <output_dir>).",
+ )
+ @click.option(
+     "--skip-audio",
+     "skip_audio",
+     is_flag=True,
+     help="Do not preprocess or cache audio files.",
+ )
+ @click.option(
+     "--debug",
+     "debug",
+     is_flag=True,
+     help="Print DEBUG messages to the console.",
+ )
+ @click.option(
+     "--add-diacritics",
+     "add_diacritics",
+     is_flag=True,
+     help="Add diacritics to text (phonemizer specific, e.g., to denote stress).",
+ )
+ def cli(
+     input_dir: Path,
+     output_dir: Path,
+     language: str,
+     sample_rate: int,
+     cache_dir: Optional[Path],
+     max_workers: Optional[int],
+     single_speaker: bool,
+     speaker_id: Optional[int],
+     phoneme_type: str,
+     alphabet: str,
+     phonemizer_model: str,
+     text_casing: str,
+     dataset_name: Optional[str],
+     audio_quality: Optional[str],
+     skip_audio: bool,
+     debug: bool,
+     add_diacritics: bool,
+ ) -> None:
+     """
+     Preprocess a TTS dataset (e.g., LJSpeech format) for training a VITS-style model.
+     This script handles text normalization, phonemization, and optional audio caching.
+     """
+     # Create a config object from click arguments for easier passing
+     config = PreprocessorConfig(
+         input_dir=input_dir,
+         output_dir=output_dir,
+         language=language,
+         sample_rate=sample_rate,
+         cache_dir=cache_dir or output_dir / "cache" / str(sample_rate),
+         max_workers=max_workers or os.cpu_count() or 1,
+         single_speaker=single_speaker,
+         speaker_id=speaker_id,
+         phoneme_type=PhonemeType(phoneme_type),
+         alphabet=Alphabet(alphabet),
+         phonemizer_model=phonemizer_model,
+         text_casing=text_casing,
+         dataset_name=dataset_name,
+         audio_quality=audio_quality,
+         skip_audio=skip_audio,
+         debug=debug,
+         add_diacritics=add_diacritics,
+     )
+
+     # Setup logging
+     level = logging.DEBUG if config.debug else logging.INFO
+     logging.basicConfig(level=level)
+     logging.getLogger().setLevel(level)
+     logging.getLogger("numba").setLevel(logging.WARNING)
+
+     # Validation
+     if config.single_speaker and (config.speaker_id is not None):
+         _LOGGER.fatal("--single-speaker and --speaker-id cannot both be provided")
+         raise click.Abort()
+
+     # Create directories
+     config.output_dir.mkdir(parents=True, exist_ok=True)
+     config.cache_dir.mkdir(parents=True, exist_ok=True)
+
+     # Load all utterances from the dataset
+     _LOGGER.info("Loading utterances from dataset...")
+     utterances: List[Utterance] = list(ljspeech_dataset(config))
+     if not utterances:
+         _LOGGER.error("No valid utterances found in dataset.")
+         return
+
+     num_utterances: int = len(utterances)
+     _LOGGER.info("Found %d utterances.", num_utterances)
+
+     # Count speakers and assign IDs
+     speaker_counts: Counter[str] = Counter(u.speaker for u in utterances if u.speaker)
+     is_multispeaker: bool = len(speaker_counts) > 1
+     speaker_ids: Dict[str, int] = {}
+     if is_multispeaker:
+         _LOGGER.info("%s speakers detected", len(speaker_counts))
+         # Assign speaker ids by most number of utterances first
+         for speaker_id, (speaker, _) in enumerate(speaker_counts.most_common()):
+             speaker_ids[speaker] = speaker_id
+     else:
+         _LOGGER.info("Single speaker dataset")
+
+     # --- Single Pass: Process audio/phonemes and collect results ---
+     _LOGGER.info("Starting single pass processing with %d workers...", config.max_workers)
+
+     # Initialize the phonemizer only once in the main process
+     phonemizer: Phonemizer = get_phonemizer(config.phoneme_type,
+                                             config.alphabet,
+                                             config.phonemizer_model)
+
+     batch_size: int = max(1, int(num_utterances / (config.max_workers * 2)))
+
+     task_queue: "JoinableQueue[Optional[List[Utterance]]]" = JoinableQueue()
+     # The result queue will hold tuples of (Utterance, set(phonemes))
+     result_queue: "Queue[Tuple[Optional[Utterance], Set[str]]]" = Queue()
+
+     # Start workers
+     processes: List[Process] = [
+         Process(
+             target=phonemize_worker,
+             args=(config, task_queue, result_queue, phonemizer)
+         )
+         for _ in range(config.max_workers)
+     ]
+
+     for proc in processes:
+         proc.start()
+
+     # Populate the task queue with batches
+     task_count: int = 0
+     for utt_batch in batched(utterances, batch_size):
+         task_queue.put(utt_batch)
+         task_count += len(utt_batch)
+
+     # Signal workers to stop
+     for _ in range(config.max_workers):
+         task_queue.put(None)
+
+     # Collect results from the queue with a progress bar
+     processed_utterances: List[Utterance] = []
+     all_phonemes: Set[str] = set()
+     for _ in tqdm(range(task_count), desc="Processing utterances"):
+         result: Tuple[Optional[Utterance], Set[str]] = result_queue.get()
+         utt, unique_phonemes = result
+         if utt is not None:
+             processed_utterances.append(utt)
+             all_phonemes.update(unique_phonemes)
+
+     # Wait for workers to finish
+     task_queue.join()
+     for proc in processes:
+         proc.join()
+
+     # --- Build the final phoneme map from the collected phonemes ---
+     _LOGGER.info("Building a complete phoneme map from collected phonemes...")
+
+     final_phoneme_id_map: Dict[str, int] = DEFAULT_SPECIAL_PHONEME_ID_MAP.copy()
+     if phonemizer.alphabet == Alphabet.IPA:
+         all_phonemes.update(DEFAULT_IPA_PHONEME_ID_MAP.keys())
+
+     # Filter out special tokens that are already in the map
+     existing_keys: Set[str] = set(final_phoneme_id_map.keys())
+     new_phonemes: List[str] = sorted([p for p in all_phonemes if p not in existing_keys])
+
+     current_id: int = len(final_phoneme_id_map)
+     for pho in new_phonemes:
+         final_phoneme_id_map[pho] = current_id
+         current_id += 1
+
+     _LOGGER.info("Final phoneme map contains %d symbols.", len(final_phoneme_id_map))
+
+     # --- Write the final config.json ---
+     _LOGGER.info("Writing dataset config...")
+     audio_quality = config.audio_quality or config.output_dir.name
+     dataset_name = config.dataset_name or config.output_dir.parent.name
+
+     config_data: Dict[str, Any] = {
+         "dataset": dataset_name,
+         "audio": {
+             "sample_rate": config.sample_rate,
+             "quality": audio_quality,
+         },
+         "lang_code": config.language,
+         "inference": {"noise_scale": 0.667,
+                       "length_scale": 1,
+                       "noise_w": 0.8,
+                       "add_diacritics": config.add_diacritics},
+         "alphabet": phonemizer.alphabet.value,
+         "phoneme_type": config.phoneme_type.value,
+         "phonemizer_model": config.phonemizer_model,
+         "phoneme_id_map": final_phoneme_id_map,
+         "num_symbols": len(final_phoneme_id_map),
+         "num_speakers": len(speaker_counts) if is_multispeaker else 1,
+         "speaker_id_map": speaker_ids,
+         "phoonnx_version": VERSION_STR,
+     }
+
+     with open(config.output_dir / "config.json", "w", encoding="utf-8") as config_file:
+         json.dump(config_data, config_file, ensure_ascii=False, indent=2)
+
+     # --- Apply final phoneme IDs and write dataset.jsonl ---
+     _LOGGER.info("Writing dataset.jsonl...")
+     valid_utterances_count: int = 0
+     with open(config.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
+         for utt in processed_utterances:
+             if is_multispeaker and utt.speaker is not None:
+                 if utt.speaker not in speaker_ids:
+                     _LOGGER.error("Speaker '%s' not in speaker_id_map. This indicates an issue with your metadata.csv file.", utt.speaker)
+                     continue
+                 utt.speaker_id = speaker_ids[utt.speaker]
+
+             # Apply the final phoneme ID map to each utterance
+             if utt.phonemes:
+                 utt.phoneme_ids = phonemes_to_ids(utt.phonemes, id_map=final_phoneme_id_map)
+
+             if not utt.phoneme_ids:
+                 _LOGGER.warning("Skipping utterance with invalid phoneme_ids before writing: %s", utt.audio_path)
+                 continue
+
+             json.dump(
+                 utt.asdict(),
+                 dataset_file,
+                 ensure_ascii=False,
+                 cls=PathEncoder,
+             )
+             print("", file=dataset_file)
+             valid_utterances_count += 1
+
+     _LOGGER.info("Preprocessing complete. Wrote %d valid utterances to dataset.jsonl.", valid_utterances_count)
+
+
+ # -----------------------------------------------------------------------------
+
+ def batched(iterable: Iterable[Any], n: int) -> Iterable[List[Any]]:
+     """
+     Batch data from an iterable into lists of length n. The last batch may be shorter.
+
+     Args:
+         iterable: The input iterable to be batched.
+         n: The desired size of each batch.
+
+     Yields:
+         List[Any]: A list representing a batch of items.
+     """
+     if n < 1:
+         raise ValueError("n must be at least one")
+     it = iter(iterable)
+     batch = list(itertools.islice(it, n))
+     while batch:
+         yield batch
+         batch = list(itertools.islice(it, n))
+
+
+ if __name__ == "__main__":
+     cli()
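The new preprocess.py writes two training artifacts into --output-dir: config.json and dataset.jsonl. A minimal consumer sketch (not part of phoonnx; filenames and keys are taken from the code above, the directory name is a placeholder):

    import json
    from pathlib import Path

    output_dir = Path("training_output")  # whatever was passed as --output-dir

    with open(output_dir / "config.json", encoding="utf-8") as f:
        config = json.load(f)
    print(config["num_symbols"], config["num_speakers"], config["phoonnx_version"])

    with open(output_dir / "dataset.jsonl", encoding="utf-8") as f:
        for line in f:
            utt = json.loads(line)            # one serialized Utterance per line
            phoneme_ids = utt["phoneme_ids"]  # ids mapped through config["phoneme_id_map"]
            audio = utt["audio_norm_path"]    # cached audio path, or None with --skip-audio
            break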