ttsforge 0.1.0__py3-none-any.whl

ttsforge/phonemes.py ADDED
@@ -0,0 +1,486 @@
+"""Phoneme data structures for ttsforge.
+
+This module provides data structures for storing and manipulating
+pre-tokenized book content (phonemes and tokens).
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Callable, Iterator
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from pykokoro.tokenizer import Tokenizer
+
+
+@dataclass
+class PhonemeSegment:
+    """A segment of text with its phoneme representation."""
+
+    text: str
+    phonemes: str
+    tokens: list[int]
+    lang: str = "en-us"
+    paragraph: int = 0
+    sentence: int | None = None
+    pause_after: float = 0.0
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        data: dict[str, Any] = {
+            "text": self.text,
+            "phonemes": self.phonemes,
+            "tokens": self.tokens,
+            "lang": self.lang,
+        }
+        if self.paragraph:
+            data["paragraph"] = self.paragraph
+        if self.sentence is not None:
+            data["sentence"] = self.sentence
+        if self.pause_after:
+            data["pause_after"] = self.pause_after
+        return data
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> PhonemeSegment:
+        """Create from dictionary."""
+        return cls(
+            text=data["text"],
+            phonemes=data["phonemes"],
+            tokens=list(data.get("tokens", [])),
+            lang=data.get("lang", "en-us"),
+            paragraph=data.get("paragraph", 0),
+            sentence=data.get("sentence"),
+            pause_after=data.get("pause_after", 0.0),
+        )
+
+    def format_readable(self) -> str:
+        """Format as human-readable string: text [phonemes]."""
+        return f"{self.text} [{self.phonemes}]"
+
+
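
A segment is plain data, so JSON round-tripping is straightforward. A minimal sketch (the phoneme string and token IDs below are illustrative values, not real tokenizer output):

    seg = PhonemeSegment(
        text="Hello world.",
        phonemes="həlˈoʊ wˈɜːld.",   # illustrative IPA, not pykokoro output
        tokens=[50, 83, 54, 156, 57],  # illustrative IDs, not real vocab values
        paragraph=1,
    )

    data = seg.to_dict()              # falsy defaults (sentence, pause_after) are omitted
    restored = PhonemeSegment.from_dict(data)
    assert restored == seg            # dataclass equality makes round-trips easy to test
    print(seg.format_readable())      # Hello world. [həlˈoʊ wˈɜːld.]
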
+# Version of the phoneme export format
+FORMAT_VERSION = "1.0"
+
+
+@dataclass
+class PhonemeChapter:
+    """A chapter containing phoneme segments.
+
+    Attributes:
+        title: Chapter title
+        segments: List of phoneme segments
+        chapter_index: Chapter number (0-based)
+    """
+
+    title: str
+    segments: list[PhonemeSegment] = field(default_factory=list)
+    chapter_index: int = 0
+
+    def add_segment(self, segment: PhonemeSegment) -> None:
+        """Add a segment to the chapter."""
+        self.segments.append(segment)
+
+    def add_text(
+        self,
+        text: str,
+        tokenizer: Tokenizer,
+        lang: str = "en-us",
+        split_mode: str = "sentence",
+        max_chars: int = 300,
+        language_model: str = "en_core_web_sm",
+        max_phoneme_length: int = 510,
+        warn_callback: Callable[[str], None] | None = None,
+    ) -> list[PhonemeSegment]:
+        """Add text by phonemizing it.
+
+        Text is split according to split_mode before phonemization to create
+        natural segment boundaries and avoid exceeding the tokenizer's maximum
+        phoneme length.
+
+        Args:
+            text: Text to add
+            tokenizer: Tokenizer instance
+            lang: Language code for phonemization
+            split_mode: How to split the text:
+                - "paragraph": Split on double newlines only
+                - "sentence": Split on sentence boundaries (using spaCy)
+                - "clause": Split on sentences + commas for finer segments
+            max_chars: Maximum characters per segment (default 300, used for
+                further splitting if segments are too long)
+            language_model: spaCy language model for sentence/clause splitting
+            max_phoneme_length: Maximum phoneme length (default 510, Kokoro limit)
+            warn_callback: Optional callback for warnings (receives warning message)
+
+        Returns:
+            List of created PhonemeSegments
+        """
+        import re
+
+        from phrasplit import split_long_lines, split_text
+
+        # Safety filter: Remove <<CHAPTER: ...>> markers that epub2text might add
+        # This provides defense-in-depth even if callers forget to filter
+        text = re.sub(
+            r"^\s*<<CHAPTER:[^>]*>>\s*\n*", "", text, count=1, flags=re.MULTILINE
+        )
+
+        def warn(msg: str) -> None:
+            """Issue a warning."""
+            if warn_callback:
+                warn_callback(msg)
+
+        def phonemize_with_split(
+            chunk: str,
+            current_max_chars: int,
+            paragraph_idx: int = 0,
+            sentence_idx: int | None = None,
+        ) -> list[PhonemeSegment]:
+            """Phonemize a chunk, splitting further if phonemes exceed limit."""
+            chunk = chunk.strip()
+            if not chunk:
+                return []
+
+            phonemes = tokenizer.phonemize(chunk, lang=lang)
+
+            # Check if phonemes exceed limit
+            if len(phonemes) > max_phoneme_length:
+                # Try splitting further if we have room
+                if current_max_chars > 50:
+                    # Reduce max_chars and retry
+                    new_max_chars = current_max_chars // 2
+                    sub_chunks = split_long_lines(chunk, new_max_chars, language_model)
+                    results = []
+                    for sub in sub_chunks:
+                        results.extend(
+                            phonemize_with_split(
+                                sub, new_max_chars, paragraph_idx, sentence_idx
+                            )
+                        )
+                    return results
+                else:
+                    # Can't split further - warn and truncate
+                    warn(
+                        f"Segment phonemes too long ({len(phonemes)} > "
+                        f"{max_phoneme_length}), truncating. Text: '{chunk[:50]}...'"
+                    )
+                    # Truncate phonemes to limit
+                    phonemes = phonemes[:max_phoneme_length]
+
+            tokens = tokenizer.tokenize(phonemes)
+            return [
+                PhonemeSegment(
+                    text=chunk,
+                    phonemes=phonemes,
+                    tokens=tokens,
+                    lang=lang,
+                    paragraph=paragraph_idx,
+                    sentence=sentence_idx,
+                )
+            ]
+
+        # Use the new unified split_text function
+        if split_mode in ["paragraph", "sentence", "clause"]:
+            phrasplit_segments = split_text(
+                text,
+                mode=split_mode,
+                language_model=language_model,
+                apply_corrections=True,
+                split_on_colon=True,
+            )
+        else:
+            # Default: treat as single chunk with paragraph 0
+            from phrasplit import Segment
+
+            phrasplit_segments = (
+                [Segment(text=text, paragraph=0, sentence=0)] if text.strip() else []
+            )
+
+        segments = []
+
+        for phrasplit_seg in phrasplit_segments:
+            chunk = phrasplit_seg.text.strip()
+            if not chunk:
+                continue
+
+            # If chunk is still too long, split it further
+            if len(chunk) > max_chars:
+                sub_chunks = split_long_lines(chunk, max_chars, language_model)
+            else:
+                sub_chunks = [chunk]
+
+            for sub_chunk in sub_chunks:
+                new_segments = phonemize_with_split(
+                    sub_chunk,
+                    max_chars,
+                    paragraph_idx=phrasplit_seg.paragraph,
+                    sentence_idx=phrasplit_seg.sentence,
+                )
+                for seg in new_segments:
+                    self.segments.append(seg)
+                    segments.append(seg)
+
+        return segments
+
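
The split-then-retry logic above halves max_chars until the phonemes fit under the Kokoro limit, so callers usually only need to pick a split_mode. A minimal driving sketch; how a pykokoro Tokenizer is actually constructed is an assumption here, only the import path is confirmed by the TYPE_CHECKING import above:

    from pykokoro.tokenizer import Tokenizer  # same path as the TYPE_CHECKING import

    tokenizer = Tokenizer()  # assumption: real construction depends on pykokoro's API
    chapter = PhonemeChapter(title="Chapter 1")
    created = chapter.add_text(
        "It was a dark and stormy night. The rain fell in torrents.",
        tokenizer,
        split_mode="sentence",  # one segment per spaCy sentence
        warn_callback=print,    # surface truncation warnings
    )
    print(len(created), chapter.total_tokens)
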
+    @property
+    def total_tokens(self) -> int:
+        """Total number of tokens in this chapter."""
+        return sum(len(s.tokens) for s in self.segments)
+
+    @property
+    def total_phonemes(self) -> int:
+        """Total number of phoneme characters in this chapter."""
+        return sum(len(s.phonemes) for s in self.segments)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "title": self.title,
+            "chapter_index": self.chapter_index,
+            "segments": [s.to_dict() for s in self.segments],
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> PhonemeChapter:
+        """Create from dictionary."""
+        chapter = cls(
+            title=data["title"],
+            chapter_index=data.get("chapter_index", 0),
+        )
+        for seg_data in data.get("segments", []):
+            chapter.segments.append(PhonemeSegment.from_dict(seg_data))
+        return chapter
+
+    def format_readable(self) -> str:
+        """Format as human-readable string."""
+        lines = [f"# {self.title}", ""]
+        for segment in self.segments:
+            lines.append(segment.format_readable())
+        return "\n".join(lines)
+
+    def iter_segments(self) -> Iterator[PhonemeSegment]:
+        """Iterate over segments."""
+        yield from self.segments
+
+
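
Chapters also work without a tokenizer: segments built elsewhere can be attached directly, and the aggregate properties stay consistent. A small sketch, reusing seg from the PhonemeSegment example above:

    chapter = PhonemeChapter(title="Prologue")
    chapter.add_segment(seg)
    assert chapter.total_tokens == len(seg.tokens)
    assert chapter.total_phonemes == len(seg.phonemes)
    print(chapter.format_readable())
    # # Prologue
    #
    # Hello world. [həlˈoʊ wˈɜːld.]
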
+@dataclass
+class PhonemeBook:
+    """A book containing multiple chapters of phoneme data.
+
+    Attributes:
+        title: Book title
+        chapters: List of chapters
+        metadata: Additional metadata
+        vocab_version: Vocabulary version used for tokenization
+        lang: Default language code
+    """
+
+    title: str
+    chapters: list[PhonemeChapter] = field(default_factory=list)
+    metadata: dict[str, Any] = field(default_factory=dict)
+    vocab_version: str = "v1.0"
+    lang: str = "en-us"
+
+    def add_chapter(self, chapter: PhonemeChapter) -> None:
+        """Add a chapter to the book."""
+        chapter.chapter_index = len(self.chapters)
+        self.chapters.append(chapter)
+
+    def create_chapter(self, title: str) -> PhonemeChapter:
+        """Create and add a new chapter.
+
+        Args:
+            title: Chapter title
+
+        Returns:
+            The created PhonemeChapter
+        """
+        chapter = PhonemeChapter(title=title, chapter_index=len(self.chapters))
+        self.chapters.append(chapter)
+        return chapter
+
+    @property
+    def total_tokens(self) -> int:
+        """Total number of tokens in the book."""
+        return sum(c.total_tokens for c in self.chapters)
+
+    @property
+    def total_phonemes(self) -> int:
+        """Total number of phoneme characters in the book."""
+        return sum(c.total_phonemes for c in self.chapters)
+
+    @property
+    def total_segments(self) -> int:
+        """Total number of segments in the book."""
+        return sum(len(c.segments) for c in self.chapters)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "format_version": FORMAT_VERSION,
+            "title": self.title,
+            "vocab_version": self.vocab_version,
+            "lang": self.lang,
+            "metadata": self.metadata,
+            "stats": {
+                "total_chapters": len(self.chapters),
+                "total_segments": self.total_segments,
+                "total_tokens": self.total_tokens,
+                "total_phonemes": self.total_phonemes,
+            },
+            "chapters": [c.to_dict() for c in self.chapters],
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> PhonemeBook:
+        """Create from dictionary."""
+        book = cls(
+            title=data["title"],
+            vocab_version=data.get("vocab_version", "v1.0"),
+            lang=data.get("lang", "en-us"),
+            metadata=data.get("metadata", {}),
+        )
+        for ch_data in data.get("chapters", []):
+            book.chapters.append(PhonemeChapter.from_dict(ch_data))
+        return book
+
+    def save(self, path: str | Path, indent: int = 2) -> None:
+        """Save to JSON file.
+
+        Args:
+            path: Output file path
+            indent: JSON indentation (use None for compact output)
+        """
+        path = Path(path)
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=indent, ensure_ascii=False)
+
+    @classmethod
+    def load(cls, path: str | Path) -> PhonemeBook:
+        """Load from JSON file.
+
+        Args:
+            path: Input file path
+
+        Returns:
+            PhonemeBook instance
+        """
+        path = Path(path)
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
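
save() writes the to_dict() payload shown above, including the derived stats block; from_dict()/load() ignore stats and rebuild everything from the chapters list. A round-trip sketch (the filename is illustrative; seg is from the earlier example):

    book = PhonemeBook(title="My Book", metadata={"author": "Anon"})
    book.create_chapter("Chapter 1").add_segment(seg)

    book.save("book.phonemes.json")
    reloaded = PhonemeBook.load("book.phonemes.json")
    assert reloaded.total_segments == book.total_segments
    assert reloaded.chapters[0].title == "Chapter 1"
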
+    def save_readable(self, path: str | Path) -> None:
+        """Save as human-readable text file.
+
+        Format: text [phonemes]
+
+        Args:
+            path: Output file path
+        """
+        path = Path(path)
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(f"# {self.title}\n")
+            f.write(f"# Vocabulary: {self.vocab_version}\n")
+            f.write(f"# Language: {self.lang}\n\n")
+
+            for chapter in self.chapters:
+                f.write(chapter.format_readable())
+                f.write("\n\n")
+
+    def iter_segments(self) -> Iterator[tuple[int, PhonemeSegment]]:
+        """Iterate over all segments with chapter index.
+
+        Yields:
+            Tuples of (chapter_index, segment)
+        """
+        for chapter in self.chapters:
+            for segment in chapter.segments:
+                yield chapter.chapter_index, segment
+
+    def iter_chapters(self) -> Iterator[PhonemeChapter]:
+        """Iterate over chapters."""
+        yield from self.chapters
+
+    def get_info(self) -> dict[str, Any]:
+        """Get summary information about the book.
+
+        Returns:
+            Dictionary with book statistics
+        """
+        return {
+            "title": self.title,
+            "vocab_version": self.vocab_version,
+            "lang": self.lang,
+            "chapters": len(self.chapters),
+            "segments": self.total_segments,
+            "tokens": self.total_tokens,
+            "phonemes": self.total_phonemes,
+            "metadata": self.metadata,
+        }
+
+
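
iter_segments() flattens the book into (chapter_index, segment) pairs, which is the natural shape for a synthesis loop. A sketch, with book from the previous example; synthesize is a hypothetical stand-in for whatever consumes the token IDs:

    def synthesize(tokens: list[int]) -> None:
        ...  # hypothetical: hand token IDs to a Kokoro-style synthesizer

    for chapter_index, segment in book.iter_segments():
        synthesize(segment.tokens)
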
+def phonemize_text_list(
+    texts: list[str],
+    tokenizer: Tokenizer,
+    lang: str = "en-us",
+) -> list[PhonemeSegment]:
+    """Phonemize a list of texts.
+
+    Args:
+        texts: List of text strings
+        tokenizer: Tokenizer instance
+        lang: Language code
+
+    Returns:
+        List of PhonemeSegment instances
+    """
+    segments = []
+    for text in texts:
+        phonemes = tokenizer.phonemize(text, lang=lang)
+        tokens = tokenizer.tokenize(phonemes)
+        segments.append(
+            PhonemeSegment(
+                text=text,
+                phonemes=phonemes,
+                tokens=tokens,
+                lang=lang,
+            )
+        )
+    return segments
+
+
+def create_phoneme_book_from_chapters(
+    title: str,
+    chapters: list[tuple[str, list[str]]],
+    tokenizer: Tokenizer,
+    lang: str = "en-us",
+    vocab_version: str = "v1.0",
+) -> PhonemeBook:
+    """Create a PhonemeBook from chapter data.
+
+    Args:
+        title: Book title
+        chapters: List of (chapter_title, paragraphs) tuples
+        tokenizer: Tokenizer instance
+        lang: Language code
+        vocab_version: Vocabulary version
+
+    Returns:
+        PhonemeBook instance
+    """
+    book = PhonemeBook(
+        title=title,
+        vocab_version=vocab_version,
+        lang=lang,
+    )
+
+    for chapter_title, paragraphs in chapters:
+        chapter = book.create_chapter(chapter_title)
+        for para in paragraphs:
+            chapter.add_text(para, tokenizer, lang=lang)
+
+    return book
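
End to end, the convenience function turns (chapter_title, paragraphs) pairs into a ready-to-save book. A sketch, with tokenizer constructed as in the earlier add_text example and illustrative filenames:

    chapters = [
        ("Chapter 1", ["First paragraph.", "Second paragraph."]),
        ("Chapter 2", ["Another paragraph."]),
    ]
    book = create_phoneme_book_from_chapters("My Book", chapters, tokenizer)
    book.save("my_book.json")          # full JSON, reloadable via PhonemeBook.load
    book.save_readable("my_book.txt")  # human-readable "text [phonemes]" listing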