ttsforge-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ttsforge/__init__.py +114 -0
- ttsforge/_version.py +34 -0
- ttsforge/audio_merge.py +180 -0
- ttsforge/audio_player.py +473 -0
- ttsforge/chapter_selection.py +75 -0
- ttsforge/cli/__init__.py +73 -0
- ttsforge/cli/commands_conversion.py +1927 -0
- ttsforge/cli/commands_phonemes.py +1033 -0
- ttsforge/cli/commands_utility.py +1389 -0
- ttsforge/cli/helpers.py +76 -0
- ttsforge/constants.py +164 -0
- ttsforge/conversion.py +1090 -0
- ttsforge/input_reader.py +408 -0
- ttsforge/kokoro_lang.py +12 -0
- ttsforge/kokoro_runner.py +125 -0
- ttsforge/name_extractor.py +305 -0
- ttsforge/phoneme_conversion.py +978 -0
- ttsforge/phonemes.py +486 -0
- ttsforge/ssmd_generator.py +422 -0
- ttsforge/utils.py +785 -0
- ttsforge/vocab/__init__.py +139 -0
- ttsforge-0.1.0.dist-info/METADATA +659 -0
- ttsforge-0.1.0.dist-info/RECORD +27 -0
- ttsforge-0.1.0.dist-info/WHEEL +5 -0
- ttsforge-0.1.0.dist-info/entry_points.txt +2 -0
- ttsforge-0.1.0.dist-info/licenses/LICENSE +21 -0
- ttsforge-0.1.0.dist-info/top_level.txt +1 -0
ttsforge/name_extractor.py
@@ -0,0 +1,305 @@
+"""Name extraction module for automatic phoneme dictionary generation.
+
+This module extracts proper names from text and generates phoneme suggestions
+using kokorog2p, making it easy to create custom phoneme dictionaries for books.
+"""
+
+import functools
+import json
+import logging
+from collections import Counter
+from collections.abc import Callable
+from datetime import datetime
+from pathlib import Path
+from typing import Any, cast
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SPACY_MODEL = "en_core_web_sm"
+
+
+@functools.lru_cache(maxsize=1)
+def _get_nlp(model_name: str = DEFAULT_SPACY_MODEL) -> Any:
+    """Load and cache the spaCy pipeline."""
+    try:
+        import spacy
+    except ImportError as e:
+        raise ImportError(
+            "spaCy is required for name extraction. "
+            "Install with: pip install spacy && python -m spacy download en_core_web_sm"
+        ) from e
+
+    try:
+        return spacy.load(model_name)
+    except OSError as e:
+        raise ImportError(
+            f"spaCy model '{model_name}' not found. "
+            f"Install with: python -m spacy download {model_name}"
+        ) from e
+
+
+def _split_text_into_chunks(text: str, chunk_size: int = 100000) -> list[str]:
+    """Split text into chunks at paragraph boundaries.
+
+    Args:
+        text: Input text to split
+        chunk_size: Target size for each chunk in characters (default: 100000)
+
+    Returns:
+        List of text chunks
+    """
+    # If text is small enough, return as single chunk
+    if len(text) <= chunk_size:
+        return [text]
+
+    chunks: list[str] = []
+    paragraphs = text.split("\n\n")
+    current_chunk: list[str] = []
+    current_size = 0
+
+    for para in paragraphs:
+        para_size = len(para)
+
+        # If adding this paragraph would exceed chunk size, start new chunk
+        if current_size > 0 and current_size + para_size > chunk_size:
+            chunks.append("\n\n".join(current_chunk))
+            current_chunk = [para]
+            current_size = para_size
+        else:
+            current_chunk.append(para)
+            current_size += para_size + 2  # +2 for \n\n
+
+    # Add final chunk
+    if current_chunk:
+        chunks.append("\n\n".join(current_chunk))
+
+    return chunks
+
+
+def extract_names_from_text(
+    text: str,
+    min_count: int = 3,
+    max_names: int = 100,
+    include_all: bool = False,
+    chunk_size: int = 100000,
+    progress_callback: Callable[[int, int], None] | None = None,
+    model_name: str = DEFAULT_SPACY_MODEL,
+    batch_size: int = 4,
+) -> dict[str, int]:
+    """Extract proper names from text using spaCy NER and POS tagging.
+
+    Args:
+        text: Input text to analyze
+        min_count: Minimum occurrences for a name to be included (default: 3)
+        max_names: Maximum number of names to return (default: 100)
+        include_all: Include all capitalized proper nouns, not just PERSON entities
+            (default: False)
+        chunk_size: Size of text chunks to process at once in characters
+            (default: 100000)
+        progress_callback: Optional callback function(current, total) for
+            progress updates
+        model_name: spaCy model name to load (default: en_core_web_sm)
+        batch_size: Batch size for spaCy pipe processing
+
+    Returns:
+        Dictionary mapping name -> occurrence count, sorted by frequency
+
+    Raises:
+        ImportError: If spaCy is not installed
+    """
+    nlp = _get_nlp(model_name)
+
+    # Split text into manageable chunks
+    chunks = _split_text_into_chunks(text, chunk_size)
+    total_chunks = len(chunks)
+
+    logger.info(f"Processing {total_chunks} chunks of text (chunk_size={chunk_size})")
+
+    # Process each chunk and accumulate candidates
+    candidates = []
+
+    for chunk_idx, doc in enumerate(nlp.pipe(chunks, batch_size=batch_size), 1):
+        if progress_callback:
+            progress_callback(chunk_idx, total_chunks)
+
+        chunk_len = len(chunks[chunk_idx - 1])
+        logger.debug(f"Processing chunk {chunk_idx}/{total_chunks} ({chunk_len} chars)")
+
+        # Method 1: Named Entity Recognition (PERSON entities)
+        for ent in doc.ents:
+            if ent.label_ == "PERSON":
+                candidates.append(ent.text)
+
+        # Method 2: Capitalized proper nouns (if include_all=True)
+        if include_all:
+            for sent in doc.sents:
+                for token in sent:
+                    # Skip first word of sentence, common words, and short words
+                    if (
+                        token.i != sent.start
+                        and token.text[0].isupper()
+                        and token.pos_ == "PROPN"
+                        and len(token.text) > 2
+                    ):
+                        candidates.append(token.text)
+
+    # Count occurrences across all chunks
+    name_counts = Counter(candidates)
+
+    # Filter by frequency and limit
+    filtered = {
+        name: count
+        for name, count in name_counts.most_common(max_names)
+        if count >= min_count
+    }
+
+    logger.info(
+        f"Extracted {len(filtered)} names from {total_chunks} chunks "
+        f"(min_count={min_count}, max={max_names})"
+    )
+
+    return filtered
+
+
+def generate_phoneme_suggestions(
+    names: dict[str, int], language: str = "en-us"
+) -> dict[str, dict[str, Any]]:
+    """Generate phoneme suggestions for a list of names.
+
+    Args:
+        names: Dictionary of name -> occurrence count
+        language: Language code for phonemization (default: 'en-us')
+
+    Returns:
+        Dictionary with phoneme suggestions and metadata:
+        {
+            "name": {
+                "phoneme": "/phoneme/",
+                "occurrences": count,
+                "suggestion_quality": "auto"
+            }
+        }
+    """
+    from kokorog2p import phonemize
+
+    suggestions: dict[str, dict[str, Any]] = {}
+
+    for name, count in names.items():
+        try:
+            # Generate phoneme using kokorog2p
+            phoneme = phonemize(name, language=language).phonemes
+
+            # Wrap in / / format for dictionary
+            phoneme_formatted = f"/{phoneme}/"
+
+            suggestions[name] = {
+                "phoneme": phoneme_formatted,
+                "occurrences": count,
+                "suggestion_quality": "auto",
+            }
+        except Exception as e:
+            logger.warning(f"Failed to generate phoneme for '{name}': {e}")
+            # Add placeholder
+            suggestions[name] = {
+                "phoneme": "/FIXME/",
+                "occurrences": count,
+                "suggestion_quality": "error",
+                "error": str(e),
+            }
+
+    return suggestions
+
+
+def save_phoneme_dictionary(
+    names_with_phonemes: dict[str, dict],
+    output_path: Path,
+    source_file: str | None = None,
+    language: str = "en-us",
+) -> None:
+    """Save phoneme dictionary to JSON file with metadata.
+
+    Args:
+        names_with_phonemes: Dictionary from generate_phoneme_suggestions()
+        output_path: Path to save JSON file
+        source_file: Optional source file name for metadata
+        language: Language code for metadata
+    """
+    metadata = {
+        "generated_at": datetime.now().isoformat(),
+        "language": language,
+        "total_names": len(names_with_phonemes),
+        "note": (
+            "Review and edit phonemes before using. "
+            "Auto-generated suggestions may need correction."
+        ),
+    }
+
+    if source_file:
+        metadata["generated_from"] = source_file
+
+    output_data = {"_metadata": metadata, "entries": names_with_phonemes}
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(output_data, f, indent=2, ensure_ascii=False)
+
+    logger.info(f"Saved phoneme dictionary to {output_path}")
+
+
+def load_simple_dictionary(file_path: Path) -> dict[str, dict[str, Any]]:
+    """Load a simple phoneme dictionary and convert to metadata format.
+
+    Args:
+        file_path: Path to JSON dictionary file
+
+    Returns:
+        Dictionary in metadata format (for editing/merging)
+    """
+    with open(file_path, encoding="utf-8") as f:
+        data = json.load(f)
+
+    # If already in metadata format, return as-is
+    if "_metadata" in data and "entries" in data:
+        return cast(dict[str, dict[str, Any]], data["entries"])
+
+    # Convert simple format to metadata format
+    entries: dict[str, dict[str, Any]] = {}
+    for name, phoneme in data.items():
+        if isinstance(phoneme, str):
+            entries[name] = {"phoneme": phoneme, "verified": False}
+        elif isinstance(phoneme, dict):
+            entries[name] = phoneme
+        else:
+            logger.warning(f"Skipping invalid entry: {name} -> {phoneme}")
+
+    return entries
+
+
+def merge_dictionaries(
+    auto_generated: dict[str, dict[str, Any]], manual: dict[str, dict[str, Any]]
+) -> dict[str, dict[str, Any]]:
+    """Merge auto-generated dictionary with manual corrections.
+
+    Manual entries take precedence over auto-generated ones.
+
+    Args:
+        auto_generated: Auto-generated phoneme dictionary
+        manual: Manually created/edited dictionary
+
+    Returns:
+        Merged dictionary
+    """
+    merged = auto_generated.copy()
+
+    for name, entry in manual.items():
+        if name in merged:
+            # Update with manual entry, preserving occurrence count
+            merged[name] = {
+                **merged[name],  # Keep occurrences
+                **entry,  # Override with manual data
+                "verified": True,
+            }
+        else:
+            # New manual entry
+            merged[name] = {**entry, "verified": True}
+
+    return merged
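As a reading aid for reviewers (not part of the packaged file), here is a minimal sketch of how the functions in this module compose into the intended workflow. It assumes spaCy with the en_core_web_sm model and kokorog2p are installed; the file paths are illustrative only.

from pathlib import Path

from ttsforge.name_extractor import (
    extract_names_from_text,
    generate_phoneme_suggestions,
    save_phoneme_dictionary,
)

# Illustrative input path, not shipped with the package.
text = Path("book.txt").read_text(encoding="utf-8")

# 1. Collect PERSON names that occur at least 3 times (at most 100 names).
names = extract_names_from_text(text, min_count=3, max_names=100)

# 2. Ask kokorog2p for a first-pass phoneme suggestion per name.
suggestions = generate_phoneme_suggestions(names, language="en-us")

# 3. Write the dictionary to JSON for manual review; hand-edited entries can
#    later be combined with merge_dictionaries(), where manual values win.
save_phoneme_dictionary(
    suggestions, Path("names_phonemes.json"), source_file="book.txt"
)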