ttsforge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,305 @@
+ """Name extraction module for automatic phoneme dictionary generation.
+
+ This module extracts proper names from text and generates phoneme suggestions
+ using kokorog2p, making it easy to create custom phoneme dictionaries for books.
+ """
+
+ import functools
+ import json
+ import logging
+ from collections import Counter
+ from collections.abc import Callable
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, cast
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_SPACY_MODEL = "en_core_web_sm"
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_nlp(model_name: str = DEFAULT_SPACY_MODEL) -> Any:
+     """Load and cache the spaCy pipeline."""
+     try:
+         import spacy
+     except ImportError as e:
+         raise ImportError(
+             "spaCy is required for name extraction. "
+             "Install with: pip install spacy && python -m spacy download en_core_web_sm"
+         ) from e
+
+     try:
+         return spacy.load(model_name)
+     except OSError as e:
+         raise ImportError(
+             f"spaCy model '{model_name}' not found. "
+             f"Install with: python -m spacy download {model_name}"
+         ) from e
+
+
+ def _split_text_into_chunks(text: str, chunk_size: int = 100000) -> list[str]:
+     """Split text into chunks at paragraph boundaries.
+
+     Args:
+         text: Input text to split
+         chunk_size: Target size for each chunk in characters (default: 100000)
+
+     Returns:
+         List of text chunks
+     """
+     # If text is small enough, return as single chunk
+     if len(text) <= chunk_size:
+         return [text]
+
+     chunks: list[str] = []
+     paragraphs = text.split("\n\n")
+     current_chunk: list[str] = []
+     current_size = 0
+
+     for para in paragraphs:
+         para_size = len(para)
+
+         # If adding this paragraph would exceed chunk size, start new chunk
+         if current_size > 0 and current_size + para_size > chunk_size:
+             chunks.append("\n\n".join(current_chunk))
+             current_chunk = [para]
+             current_size = para_size
+         else:
+             current_chunk.append(para)
+             current_size += para_size + 2  # +2 for \n\n
+
+     # Add final chunk
+     if current_chunk:
+         chunks.append("\n\n".join(current_chunk))
+
+     return chunks
+
+
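A minimal sketch of the chunking behaviour, assuming _split_text_into_chunks from this module is in scope; the sizes are arbitrary:

# Three paragraphs of 60 characters each, separated by blank lines.
text = "\n\n".join(["A" * 60, "B" * 60, "C" * 60])

chunks = _split_text_into_chunks(text, chunk_size=130)

# The first two paragraphs fit in one 122-character chunk; adding the third
# would push it past 130, so it starts a new chunk.
assert chunks == ["A" * 60 + "\n\n" + "B" * 60, "C" * 60]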
+ def extract_names_from_text(
+     text: str,
+     min_count: int = 3,
+     max_names: int = 100,
+     include_all: bool = False,
+     chunk_size: int = 100000,
+     progress_callback: Callable[[int, int], None] | None = None,
+     model_name: str = DEFAULT_SPACY_MODEL,
+     batch_size: int = 4,
+ ) -> dict[str, int]:
+     """Extract proper names from text using spaCy NER and POS tagging.
+
+     Args:
+         text: Input text to analyze
+         min_count: Minimum occurrences for a name to be included (default: 3)
+         max_names: Maximum number of names to return (default: 100)
+         include_all: Include all capitalized proper nouns, not just PERSON entities
+             (default: False)
+         chunk_size: Size of text chunks to process at once in characters
+             (default: 100000)
+         progress_callback: Optional callback function(current, total) for
+             progress updates
+         model_name: spaCy model name to load (default: en_core_web_sm)
+         batch_size: Batch size for spaCy pipe processing (default: 4)
+
+     Returns:
+         Dictionary mapping name -> occurrence count, sorted by frequency
+
+     Raises:
+         ImportError: If spaCy is not installed
+     """
+     nlp = _get_nlp(model_name)
+
+     # Split text into manageable chunks
+     chunks = _split_text_into_chunks(text, chunk_size)
+     total_chunks = len(chunks)
+
+     logger.info(f"Processing {total_chunks} chunks of text (chunk_size={chunk_size})")
+
+     # Process each chunk and accumulate candidates
+     candidates: list[str] = []
+
+     for chunk_idx, doc in enumerate(nlp.pipe(chunks, batch_size=batch_size), 1):
+         if progress_callback:
+             progress_callback(chunk_idx, total_chunks)
+
+         chunk_len = len(chunks[chunk_idx - 1])
+         logger.debug(f"Processing chunk {chunk_idx}/{total_chunks} ({chunk_len} chars)")
+
+         # Method 1: Named Entity Recognition (PERSON entities)
+         for ent in doc.ents:
+             if ent.label_ == "PERSON":
+                 candidates.append(ent.text)
+
+         # Method 2: Capitalized proper nouns (if include_all=True)
+         if include_all:
+             for sent in doc.sents:
+                 for token in sent:
+                     # Skip first word of sentence, common words, and short words
+                     if (
+                         token.i != sent.start
+                         and token.text[0].isupper()
+                         and token.pos_ == "PROPN"
+                         and len(token.text) > 2
+                     ):
+                         candidates.append(token.text)
+
+     # Count occurrences across all chunks
+     name_counts = Counter(candidates)
+
+     # Filter by frequency and limit
+     filtered = {
+         name: count
+         for name, count in name_counts.most_common(max_names)
+         if count >= min_count
+     }
+
+     logger.info(
+         f"Extracted {len(filtered)} names from {total_chunks} chunks "
+         f"(min_count={min_count}, max={max_names})"
+     )
+
+     return filtered
+
+
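A hedged usage sketch for the extractor above; the input path and the counts in the comment are placeholders, and spaCy plus the en_core_web_sm model must be installed for it to run:

book_text = Path("book.txt").read_text(encoding="utf-8")  # placeholder path

def report(current: int, total: int) -> None:
    print(f"chunk {current}/{total}")

names = extract_names_from_text(
    book_text,
    min_count=5,           # drop names seen fewer than 5 times
    include_all=True,      # also keep non-PERSON proper nouns
    progress_callback=report,
)
# Most frequent first, e.g. {"Alice": 412, "Gatsby": 97, ...}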
+ def generate_phoneme_suggestions(
+     names: dict[str, int], language: str = "en-us"
+ ) -> dict[str, dict[str, Any]]:
+     """Generate phoneme suggestions for a list of names.
+
+     Args:
+         names: Dictionary of name -> occurrence count
+         language: Language code for phonemization (default: 'en-us')
+
+     Returns:
+         Dictionary with phoneme suggestions and metadata:
+             {
+                 "name": {
+                     "phoneme": "/phoneme/",
+                     "occurrences": count,
+                     "suggestion_quality": "auto"
+                 }
+             }
+     """
+     from kokorog2p import phonemize
+
+     suggestions: dict[str, dict[str, Any]] = {}
+
+     for name, count in names.items():
+         try:
+             # Generate phoneme using kokorog2p
+             phoneme = phonemize(name, language=language).phonemes
+
+             # Wrap in / / format for dictionary
+             phoneme_formatted = f"/{phoneme}/"
+
+             suggestions[name] = {
+                 "phoneme": phoneme_formatted,
+                 "occurrences": count,
+                 "suggestion_quality": "auto",
+             }
+         except Exception as e:
+             logger.warning(f"Failed to generate phoneme for '{name}': {e}")
+             # Add placeholder
+             suggestions[name] = {
+                 "phoneme": "/FIXME/",
+                 "occurrences": count,
+                 "suggestion_quality": "error",
+                 "error": str(e),
+             }
+
+     return suggestions
+
+
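A small sketch of the entry shape generate_phoneme_suggestions produces; the name is made up and the phoneme strings in the comments are stand-ins, not real kokorog2p output:

suggestions = generate_phoneme_suggestions({"Aerith": 42}, language="en-us")
# On success, suggestions["Aerith"] looks like:
#     {"phoneme": "/.../", "occurrences": 42, "suggestion_quality": "auto"}
# If kokorog2p raises for a name, a placeholder entry is kept instead:
#     {"phoneme": "/FIXME/", "occurrences": 42, "suggestion_quality": "error", "error": "..."}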
+ def save_phoneme_dictionary(
+     names_with_phonemes: dict[str, dict],
+     output_path: Path,
+     source_file: str | None = None,
+     language: str = "en-us",
+ ) -> None:
+     """Save phoneme dictionary to JSON file with metadata.
+
+     Args:
+         names_with_phonemes: Dictionary from generate_phoneme_suggestions()
+         output_path: Path to save JSON file
+         source_file: Optional source file name for metadata
+         language: Language code for metadata
+     """
+     metadata = {
+         "generated_at": datetime.now().isoformat(),
+         "language": language,
+         "total_names": len(names_with_phonemes),
+         "note": (
+             "Review and edit phonemes before using. "
+             "Auto-generated suggestions may need correction."
+         ),
+     }
+
+     if source_file:
+         metadata["generated_from"] = source_file
+
+     output_data = {"_metadata": metadata, "entries": names_with_phonemes}
+
+     with open(output_path, "w", encoding="utf-8") as f:
+         json.dump(output_data, f, indent=2, ensure_ascii=False)
+
+     logger.info(f"Saved phoneme dictionary to {output_path}")
+
+
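Continuing the sketch, the file written by save_phoneme_dictionary has roughly this layout; the timestamp, file names, and values are illustrative:

save_phoneme_dictionary(suggestions, Path("names.json"), source_file="book.txt")
# names.json:
# {
#   "_metadata": {
#     "generated_at": "2024-01-01T12:00:00",
#     "language": "en-us",
#     "total_names": 1,
#     "note": "Review and edit phonemes before using. ...",
#     "generated_from": "book.txt"
#   },
#   "entries": {
#     "Aerith": {"phoneme": "/.../", "occurrences": 42, "suggestion_quality": "auto"}
#   }
# }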
+ def load_simple_dictionary(file_path: Path) -> dict[str, dict[str, Any]]:
+     """Load a simple phoneme dictionary and convert to metadata format.
+
+     Args:
+         file_path: Path to JSON dictionary file
+
+     Returns:
+         Dictionary in metadata format (for editing/merging)
+     """
+     with open(file_path, encoding="utf-8") as f:
+         data = json.load(f)
+
+     # If already in metadata format, return the entries as-is
+     if "_metadata" in data and "entries" in data:
+         return cast(dict[str, dict[str, Any]], data["entries"])
+
+     # Convert simple format to metadata format
+     entries: dict[str, dict[str, Any]] = {}
+     for name, phoneme in data.items():
+         if isinstance(phoneme, str):
+             entries[name] = {"phoneme": phoneme, "verified": False}
+         elif isinstance(phoneme, dict):
+             entries[name] = phoneme
+         else:
+             logger.warning(f"Skipping invalid entry: {name} -> {phoneme}")
+
+     return entries
+
+
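load_simple_dictionary also accepts a flat name-to-phoneme JSON file; a sketch of that input shape and what it becomes, with the file name and phoneme values illustrative:

# manual_names.json, flat format:
# {"Aerith": "/.../", "Sephiroth": "/.../"}
entries = load_simple_dictionary(Path("manual_names.json"))
# Flat entries come back as {"phoneme": "/.../", "verified": False}; files already
# in the {"_metadata": ..., "entries": ...} layout are returned unchanged.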
+ def merge_dictionaries(
+     auto_generated: dict[str, dict[str, Any]], manual: dict[str, dict[str, Any]]
+ ) -> dict[str, dict[str, Any]]:
+     """Merge auto-generated dictionary with manual corrections.
+
+     Manual entries take precedence over auto-generated ones.
+
+     Args:
+         auto_generated: Auto-generated phoneme dictionary
+         manual: Manually created/edited dictionary
+
+     Returns:
+         Merged dictionary
+     """
+     merged = auto_generated.copy()
+
+     for name, entry in manual.items():
+         if name in merged:
+             # Update with manual entry, preserving occurrence count
+             merged[name] = {
+                 **merged[name],  # Keep occurrences
+                 **entry,  # Override with manual data
+                 "verified": True,
+             }
+         else:
+             # New manual entry
+             merged[name] = {**entry, "verified": True}
+
+     return merged
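Putting the pieces together, a hedged end-to-end sketch; the import path and file names are assumptions, not taken from the package:

from pathlib import Path

# The module path is a guess at where this file lives inside ttsforge;
# adjust it to the real location.
from ttsforge.name_extraction import (
    extract_names_from_text,
    generate_phoneme_suggestions,
    load_simple_dictionary,
    merge_dictionaries,
    save_phoneme_dictionary,
)

text = Path("book.txt").read_text(encoding="utf-8")  # placeholder input

names = extract_names_from_text(text, min_count=3, max_names=100)
auto = generate_phoneme_suggestions(names, language="en-us")

# Hand-edited corrections win over auto suggestions and are marked verified.
manual = load_simple_dictionary(Path("manual_names.json"))
merged = merge_dictionaries(auto, manual)

save_phoneme_dictionary(merged, Path("names.json"), source_file="book.txt")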