ttsforge-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,978 @@
1
+ """Phoneme-based TTS conversion module for ttsforge.
2
+
3
+ This module converts pre-tokenized PhonemeBook files to audio,
4
+ bypassing text-to-phoneme conversion since phonemes/tokens are pre-computed.
5
+ """
6
+
7
+ import json
8
+ import subprocess
9
+ import threading
10
+ import time
11
+ from collections.abc import Callable
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import Any, Literal, Optional, cast
15
+
16
+ import numpy as np
17
+ import soundfile as sf
18
+
19
+ from .audio_merge import AudioMerger, MergeMeta
20
+ from .chapter_selection import parse_chapter_selection
21
+ from .constants import SAMPLE_RATE, SUPPORTED_OUTPUT_FORMATS
22
+ from .kokoro_lang import get_onnx_lang_code
23
+ from .kokoro_runner import KokoroRunner, KokoroRunOptions
24
+ from .phonemes import PhonemeBook, PhonemeChapter, PhonemeSegment
25
+ from .utils import (
26
+ atomic_write_json,
27
+ create_process,
28
+ format_duration,
29
+ format_filename_template,
30
+ get_ffmpeg_path,
31
+ prevent_sleep_end,
32
+ prevent_sleep_start,
33
+ sanitize_filename,
34
+ )
35
+
36
+
37
+ @dataclass
38
+ class PhonemeConversionProgress:
39
+ """Progress information during phoneme conversion."""
40
+
41
+ current_chapter: int = 0
42
+ total_chapters: int = 0
43
+ chapter_name: str = ""
44
+ current_segment: int = 0
45
+ total_segments: int = 0
46
+ segments_processed: int = 0 # Global segment count
47
+ total_segments_all: int = 0 # Total segments across all chapters
48
+ current_text: str = ""
49
+ elapsed_time: float = 0.0
50
+ estimated_remaining: float = 0.0
51
+
52
+ @property
53
+ def percent(self) -> int:
54
+ if self.total_segments_all == 0:
55
+ return 0
56
+ return min(int(self.segments_processed / self.total_segments_all * 100), 99)
57
+
58
+ @property
59
+ def etr_formatted(self) -> str:
60
+ return format_duration(self.estimated_remaining)
61
+
62
+
63
+ @dataclass
64
+ class PhonemeConversionResult:
65
+ """Result of a phoneme conversion operation."""
66
+
67
+ success: bool
68
+ output_path: Path | None = None
69
+ error_message: str | None = None
70
+ chapters_dir: Path | None = None
71
+ duration: float = 0.0
72
+
73
+
74
+ @dataclass
75
+ class PhonemeChapterState:
76
+ """State of a single chapter conversion."""
77
+
78
+ index: int
79
+ title: str
80
+ segment_count: int
81
+ completed: bool = False
82
+ audio_file: str | None = None # Relative path to chapter audio
83
+ duration: float = 0.0
84
+
85
+
86
+ @dataclass
87
+ class PhonemeConversionState:
88
+ """Persistent state for resumable phoneme conversions."""
89
+
90
+ version: int = 1
91
+ source_file: str = ""
92
+ output_file: str = ""
93
+ work_dir: str = ""
94
+ voice: str = ""
95
+ speed: float = 1.0
96
+ output_format: str = "m4b"
97
+ silence_between_chapters: float = 2.0
98
+ pause_clause: float = 0.25
99
+ pause_sentence: float = 0.2
100
+ pause_paragraph: float = 0.75
101
+ pause_variance: float = 0.05
102
+ pause_mode: str = "auto"
103
+ lang: str | None = None # Language override for phonemization
104
+ chapters: list[PhonemeChapterState] = field(default_factory=list)
105
+ started_at: str = ""
106
+ last_updated: str = ""
107
+ # Track selected chapters (0-based indices)
108
+ selected_chapters: list[int] = field(default_factory=list)
109
+
110
+ def get_completed_count(self) -> int:
111
+ """Get number of completed chapters."""
112
+ return sum(1 for ch in self.chapters if ch.completed)
113
+
114
+ @classmethod
115
+ def load(cls, state_file: Path) -> Optional["PhonemeConversionState"]:
116
+ """Load state from a JSON file."""
117
+ if not state_file.exists():
118
+ return None
119
+ try:
120
+ with open(state_file, encoding="utf-8") as f:
121
+ data = json.load(f)
122
+
123
+ # Reconstruct PhonemeChapterState objects
124
+ chapters = [PhonemeChapterState(**ch) for ch in data.get("chapters", [])]
125
+ data["chapters"] = chapters
126
+
127
+ # Handle missing fields for backward compatibility
128
+ if "silence_between_chapters" not in data:
129
+ data["silence_between_chapters"] = 2.0
130
+ if "selected_chapters" not in data:
131
+ data["selected_chapters"] = []
132
+
133
+ # Migrate old pause parameters to new system
134
+ if "segment_pause_min" in data or "segment_pause_max" in data:
135
+ seg_min = data.get("segment_pause_min", 0.1)
136
+ seg_max = data.get("segment_pause_max", 0.3)
137
+ data["pause_sentence"] = (seg_min + seg_max) / 2.0
138
+ if "pause_variance" not in data:
139
+ data["pause_variance"] = max(0.01, (seg_max - seg_min) / 4.0)
140
+
141
+ if "paragraph_pause_min" in data or "paragraph_pause_max" in data:
142
+ para_min = data.get("paragraph_pause_min", 0.5)
143
+ para_max = data.get("paragraph_pause_max", 1.0)
144
+ data["pause_paragraph"] = (para_min + para_max) / 2.0
145
+
146
+ # Set defaults for new parameters
147
+ if "pause_clause" not in data:
148
+ data["pause_clause"] = 0.25
149
+ if "pause_sentence" not in data:
150
+ data["pause_sentence"] = 0.2
151
+ if "pause_paragraph" not in data:
152
+ data["pause_paragraph"] = 0.75
153
+ if "pause_variance" not in data:
154
+ data["pause_variance"] = 0.05
155
+ if "pause_mode" not in data:
156
+ data["pause_mode"] = "auto"
157
+ if "lang" not in data:
158
+ data["lang"] = None
159
+
160
+ return cls(**data)
161
+ except (json.JSONDecodeError, TypeError, KeyError):
162
+ return None
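A small worked example of the legacy-field migration above (the input dict is hypothetical):

    # Old-format state carried min/max pause ranges; load() collapses them.
    old = {"segment_pause_min": 0.1, "segment_pause_max": 0.3}
    pause_sentence = (old["segment_pause_min"] + old["segment_pause_max"]) / 2.0             # 0.2
    pause_variance = max(0.01, (old["segment_pause_max"] - old["segment_pause_min"]) / 4.0)  # 0.05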
163
+
164
+ def save(self, state_file: Path) -> None:
165
+ """Save state to a JSON file."""
166
+ self.last_updated = time.strftime("%Y-%m-%d %H:%M:%S")
167
+ data = {
168
+ "version": self.version,
169
+ "source_file": self.source_file,
170
+ "output_file": self.output_file,
171
+ "work_dir": self.work_dir,
172
+ "voice": self.voice,
173
+ "speed": self.speed,
174
+ "output_format": self.output_format,
175
+ "silence_between_chapters": self.silence_between_chapters,
176
+ "pause_clause": self.pause_clause,
177
+ "pause_sentence": self.pause_sentence,
178
+ "pause_paragraph": self.pause_paragraph,
179
+ "pause_variance": self.pause_variance,
180
+ "pause_mode": self.pause_mode,
181
+ "lang": self.lang,
182
+ "chapters": [
183
+ {
184
+ "index": ch.index,
185
+ "title": ch.title,
186
+ "segment_count": ch.segment_count,
187
+ "completed": ch.completed,
188
+ "audio_file": ch.audio_file,
189
+ "duration": ch.duration,
190
+ }
191
+ for ch in self.chapters
192
+ ],
193
+ "started_at": self.started_at,
194
+ "last_updated": self.last_updated,
195
+ "selected_chapters": self.selected_chapters,
196
+ }
197
+ atomic_write_json(state_file, data, indent=2, ensure_ascii=True)
198
+
199
+
200
+ @dataclass
201
+ class PhonemeConversionOptions:
202
+ """Options for phoneme-based TTS conversion."""
203
+
204
+ voice: str = "af_heart"
205
+ speed: float = 1.0
206
+ output_format: str = "m4b"
207
+ use_gpu: bool = False
208
+ silence_between_chapters: float = 2.0
209
+ # Language override for phonemization (e.g., 'de', 'en-us', 'fr')
210
+ # If None, language from PhonemeSegments is used
211
+ lang: str | None = None
212
+ # Pause settings (pykokoro built-in pause handling)
213
+ pause_clause: float = 0.25 # For clause boundaries (commas)
214
+ pause_sentence: float = 0.2 # For sentence boundaries
215
+ pause_paragraph: float = 0.75 # For paragraph boundaries
216
+ pause_variance: float = 0.05 # Standard deviation for natural variation
217
+ pause_mode: str = "auto" # "tts", "manual", or "auto"
218
+ # Chapter announcement settings
219
+ announce_chapters: bool = True # Read chapter titles aloud before content
220
+ chapter_pause_after_title: float = 2.0 # Pause after chapter title (seconds)
221
+ # Metadata for m4b
222
+ title: str | None = None
223
+ author: str | None = None
224
+ cover_image: Path | None = None
225
+ # Voice blending (e.g., "af_nicole:50,am_michael:50")
226
+ voice_blend: str | None = None
227
+ # Voice database for custom/synthetic voices
228
+ voice_database: Path | None = None
229
+ # Chapter selection (e.g., "1-5" or "3,5,7") - 1-based
230
+ chapters: str | None = None
231
+ # Resume capability
232
+ resume: bool = True
233
+ # Keep chapter files after merge
234
+ keep_chapter_files: bool = False
235
+ # Filename template for chapter files
236
+ chapter_filename_template: str = "{chapter_num:03d}_{book_title}_{chapter_title}"
237
+ # Custom ONNX model path (None = use default downloaded model)
238
+ model_path: Path | None = None
239
+ # Custom voices.bin path (None = use default downloaded voices)
240
+ voices_path: Path | None = None
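A sketch of typical option values (illustrative; the paths, titles, and blend string are placeholders):

    opts = PhonemeConversionOptions(
        voice="af_heart",
        speed=1.0,
        output_format="m4b",
        voice_blend="af_nicole:50,am_michael:50",  # 50/50 mix, same format as noted above
        chapters="1-5",                            # 1-based selection: first five chapters
        title="Example Book",
        author="Example Author",
        cover_image=Path("cover.jpg"),             # Path from pathlib; placeholder file
    )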
241
+
242
+
243
+ class PhonemeConverter:
244
+ """Converts PhonemeBook to audio using pre-tokenized phonemes/tokens."""
245
+
246
+ def __init__(
247
+ self,
248
+ book: PhonemeBook,
249
+ options: PhonemeConversionOptions,
250
+ progress_callback: Callable[[PhonemeConversionProgress], None] | None = None,
251
+ log_callback: Callable[[str, str], None] | None = None,
252
+ ) -> None:
253
+ """
254
+ Initialize the phoneme converter.
255
+
256
+ Args:
257
+ book: PhonemeBook to convert
258
+ options: Conversion options
259
+ progress_callback: Called with progress updates
260
+ log_callback: Called with log messages (message, level)
261
+ """
262
+ self.book = book
263
+ self.options = options
264
+ self.progress_callback = progress_callback
265
+ self.log_callback = log_callback
266
+ self._cancel_event = threading.Event()
267
+ self._runner: KokoroRunner | None = None
268
+ self._merger = AudioMerger(log=self.log)
269
+
270
+ @property
271
+ def _cancelled(self) -> bool:
272
+ return self._cancel_event.is_set()
273
+
274
+ def log(self, message: str, level: str = "info") -> None:
275
+ """Log a message."""
276
+ if self.log_callback:
277
+ self.log_callback(message, level)
278
+
279
+ def cancel(self) -> None:
280
+ """Request cancellation of the conversion."""
281
+ self._cancel_event.set()
282
+
283
+ def _phoneme_segments_to_ssmd(self, segments: list[PhonemeSegment]) -> str:
284
+ """Build SSMD text from phoneme segments."""
285
+ parts: list[str] = []
286
+ for idx, segment in enumerate(segments):
287
+ phonemes = segment.phonemes.strip()
288
+ if not phonemes:
289
+ continue
290
+ parts.append(phonemes)
291
+ if idx >= len(segments) - 1:
292
+ continue
293
+ next_segment = segments[idx + 1]
294
+ strength = "p" if next_segment.paragraph != segment.paragraph else "s"
295
+ parts.append(f"...{strength}")
296
+ parts.append("\n" if strength == "p" else " ")
297
+ return "".join(parts).strip()
298
+
299
+ def _generate_silence(self, duration: float) -> np.ndarray:
300
+ """Generate silence audio of given duration."""
301
+ samples = int(duration * SAMPLE_RATE)
302
+ return np.zeros(samples, dtype="float32")
303
+
304
+ def _setup_output(
305
+ self, output_path: Path
306
+ ) -> tuple[sf.SoundFile | None, subprocess.Popen[bytes] | None]:
307
+ """Set up output file or ffmpeg process based on format."""
308
+ fmt = self.options.output_format
309
+
310
+ if fmt == "wav":
311
+ out_file = sf.SoundFile(
312
+ str(output_path),
313
+ "w",
314
+ samplerate=SAMPLE_RATE,
315
+ channels=1,
316
+ format=fmt,
317
+ )
318
+ return out_file, None
319
+
320
+ # Formats requiring ffmpeg
321
+ ffmpeg = get_ffmpeg_path()
322
+
323
+ cmd = [
324
+ ffmpeg,
325
+ "-y",
326
+ "-thread_queue_size",
327
+ "32768",
328
+ "-f",
329
+ "f32le",
330
+ "-ar",
331
+ str(SAMPLE_RATE),
332
+ "-ac",
333
+ "1",
334
+ "-i",
335
+ "pipe:0",
336
+ ]
337
+
338
+ if fmt == "m4b":
339
+ # Add cover image if provided
340
+ if self.options.cover_image and self.options.cover_image.exists():
341
+ cmd.extend(
342
+ [
343
+ "-i",
344
+ str(self.options.cover_image),
345
+ "-map",
346
+ "0:a",
347
+ "-map",
348
+ "1",
349
+ "-c:v",
350
+ "copy",
351
+ "-disposition:v",
352
+ "attached_pic",
353
+ ]
354
+ )
355
+ cmd.extend(
356
+ [
357
+ "-c:a",
358
+ "aac",
359
+ "-q:a",
360
+ "2",
361
+ "-movflags",
362
+ "+faststart+use_metadata_tags",
363
+ ]
364
+ )
365
+ # Add metadata
366
+ if self.options.title:
367
+ cmd.extend(["-metadata", f"title={self.options.title}"])
368
+ if self.options.author:
369
+ cmd.extend(["-metadata", f"artist={self.options.author}"])
370
+ elif fmt == "opus":
371
+ cmd.extend(["-c:a", "libopus", "-b:a", "24000"])
372
+
373
+ cmd.append(str(output_path))
374
+
375
+ ffmpeg_proc = cast(
376
+ subprocess.Popen[bytes],
377
+ create_process(
378
+ cmd, stdin=subprocess.PIPE, text=False, suppress_output=True
379
+ ),
380
+ )
381
+ return None, ffmpeg_proc
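For an m4b target without a cover image, the assembled command is roughly the following (SAMPLE_RATE is substituted from the module constant; the metadata flags appear only when title/author are set; the output path is a placeholder):

    # ffmpeg -y -thread_queue_size 32768 -f f32le -ar <SAMPLE_RATE> -ac 1 -i pipe:0 \
    #        -c:a aac -q:a 2 -movflags +faststart+use_metadata_tags \
    #        -metadata title=<title> -metadata artist=<author> book.m4b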
382
+
383
+ def _finalize_output(
384
+ self,
385
+ out_file: sf.SoundFile | None,
386
+ ffmpeg_proc: subprocess.Popen[bytes] | None,
387
+ ) -> None:
388
+ """Finalize and close output file/process."""
389
+ if out_file is not None:
390
+ out_file.close()
391
+ elif ffmpeg_proc is not None:
392
+ if ffmpeg_proc.stdin is not None:
393
+ ffmpeg_proc.stdin.close()
394
+ ffmpeg_proc.wait()
395
+
396
+ def _write_audio_chunk(
397
+ self,
398
+ audio: np.ndarray,
399
+ out_file: sf.SoundFile | None,
400
+ ffmpeg_proc: subprocess.Popen[bytes] | None,
401
+ ) -> None:
402
+ """Write audio chunk to file or ffmpeg process."""
403
+ if out_file is not None:
404
+ out_file.write(audio)
405
+ elif ffmpeg_proc is not None and ffmpeg_proc.stdin is not None:
406
+ audio_bytes = audio.astype("float32").tobytes()
407
+ ffmpeg_proc.stdin.write(audio_bytes)
408
+
409
+ def _convert_chapter_to_wav(
410
+ self,
411
+ chapter: PhonemeChapter,
412
+ output_file: Path,
413
+ progress: PhonemeConversionProgress | None = None,
414
+ start_time: float | None = None,
415
+ segments_before: int = 0,
416
+ ) -> tuple[float, int]:
417
+ """
418
+ Convert a single chapter to a WAV file.
419
+
420
+ Args:
421
+ chapter: PhonemeChapter to convert
422
+ output_file: Output WAV file path
423
+ progress: Optional progress object to update
424
+ start_time: Conversion start time for ETA calculation
425
+ segments_before: Segments processed before this chapter
426
+
427
+ Returns:
428
+ Tuple of (duration in seconds, segments processed)
429
+ """
430
+ segments_processed = 0
431
+ total_segments = len(chapter.segments)
432
+ assert self._runner is not None
433
+ lang_code = (
434
+ get_onnx_lang_code(self.options.lang)
435
+ if self.options.lang
436
+ else (chapter.segments[0].lang if chapter.segments else "en-us")
437
+ )
438
+
439
+ # Open WAV file for writing
440
+ with sf.SoundFile(
441
+ str(output_file),
442
+ "w",
443
+ samplerate=SAMPLE_RATE,
444
+ channels=1,
445
+ format="wav",
446
+ ) as out_file:
447
+ duration = 0.0
448
+
449
+ # Announce chapter title if enabled
450
+ # Only announce if there are segments to follow
451
+ if self.options.announce_chapters and chapter.title and chapter.segments:
452
+ title_samples = self._runner.synthesize(
453
+ chapter.title,
454
+ lang_code=lang_code,
455
+ pause_mode="tts",
456
+ is_phonemes=False,
457
+ )
458
+ out_file.write(title_samples)
459
+ duration += len(title_samples) / SAMPLE_RATE
460
+
461
+ # Add pause after chapter title
462
+ pause_duration = self.options.chapter_pause_after_title
463
+ if pause_duration > 0:
464
+ pause_samples = int(pause_duration * SAMPLE_RATE)
465
+ pause_audio = np.zeros(pause_samples, dtype=np.float32)
466
+ out_file.write(pause_audio)
467
+ duration += pause_duration
468
+
469
+ if not self._cancel_event.is_set() and chapter.segments:
470
+ # Single pipeline call for entire chapter
471
+ ssmd_text = self._phoneme_segments_to_ssmd(chapter.segments)
472
+ samples = self._runner.synthesize(
473
+ ssmd_text,
474
+ lang_code=lang_code,
475
+ pause_mode=cast(
476
+ Literal["tts", "manual", "auto"], self.options.pause_mode
477
+ ),
478
+ is_phonemes=True,
479
+ )
480
+
481
+ out_file.write(samples)
482
+ duration += len(samples) / SAMPLE_RATE
483
+ segments_processed = total_segments
484
+
485
+ # Update progress once per chapter
486
+ if progress and self.progress_callback:
487
+ progress.current_segment = segments_processed
488
+ progress.segments_processed = segments_before + segments_processed
489
+ ch_title = chapter.title or "chapter"
490
+ progress.current_text = (
491
+ f"Completed {ch_title} ({segments_processed} segments)"
492
+ )
493
+ if start_time and progress.total_segments_all > 0:
494
+ elapsed = time.time() - start_time
495
+ if progress.segments_processed > 0 and elapsed > 0.5:
496
+ avg_time = elapsed / progress.segments_processed
497
+ remaining = (
498
+ progress.total_segments_all
499
+ - progress.segments_processed
500
+ )
501
+ progress.estimated_remaining = avg_time * remaining
502
+ progress.elapsed_time = elapsed
503
+ self.progress_callback(progress)
504
+
505
+ return duration, segments_processed
506
+
507
+ def _get_selected_chapters(self) -> list[PhonemeChapter]:
508
+ """Get chapters based on selection option."""
509
+ if not self.options.chapters:
510
+ return list(self.book.chapters)
511
+
512
+ indices = parse_chapter_selection(
513
+ self.options.chapters, len(self.book.chapters)
514
+ )
515
+ return [self.book.chapters[i] for i in indices]
516
+
517
+ def _get_selected_indices(self) -> list[int]:
518
+ """Get 0-based chapter indices based on selection option."""
519
+ if not self.options.chapters:
520
+ return list(range(len(self.book.chapters)))
521
+
522
+ return parse_chapter_selection(self.options.chapters, len(self.book.chapters))
523
+
524
+ def convert(self, output_path: Path) -> PhonemeConversionResult:
525
+ """
526
+ Convert PhonemeBook to audio with resume capability.
527
+
528
+ Each chapter is saved as a separate WAV file, allowing conversion
529
+ to be resumed if interrupted. A state file tracks progress.
530
+
531
+ Args:
532
+ output_path: Output file path
533
+
534
+ Returns:
535
+ PhonemeConversionResult with success status and paths
536
+ """
537
+ selected_chapters = self._get_selected_chapters()
538
+ selected_indices = self._get_selected_indices()
539
+
540
+ if not selected_chapters:
541
+ return PhonemeConversionResult(
542
+ success=False, error_message="No chapters to convert"
543
+ )
544
+
545
+ if self.options.output_format not in SUPPORTED_OUTPUT_FORMATS:
546
+ return PhonemeConversionResult(
547
+ success=False,
548
+ error_message=f"Unsupported format: {self.options.output_format}",
549
+ )
550
+
551
+ self._cancel_event.clear()
552
+ prevent_sleep_start()
553
+
554
+ try:
555
+ # Set up work directory for chapter files (use book title)
556
+ safe_book_title = sanitize_filename(
557
+ self.options.title or self.book.title or output_path.stem
558
+ )[:50]
559
+ work_dir = output_path.parent / f".{safe_book_title}_chapters"
560
+ work_dir.mkdir(parents=True, exist_ok=True)
561
+ state_file = work_dir / f"{safe_book_title}_state.json"
562
+
563
+ # Load or create state
564
+ state: PhonemeConversionState | None = None
565
+ if self.options.resume and state_file.exists():
566
+ state = PhonemeConversionState.load(state_file)
567
+ if state:
568
+ # Check if selected chapters match
569
+ if state.selected_chapters != selected_indices:
570
+ self.log(
571
+ "Chapter selection changed, starting fresh conversion",
572
+ "warning",
573
+ )
574
+ state = None
575
+ # Check if settings differ from saved state
576
+ elif (
577
+ state.voice != self.options.voice
578
+ or state.speed != self.options.speed
579
+ or state.silence_between_chapters
580
+ != self.options.silence_between_chapters
581
+ or state.pause_clause != self.options.pause_clause
582
+ or state.pause_sentence != self.options.pause_sentence
583
+ or state.pause_paragraph != self.options.pause_paragraph
584
+ or state.pause_variance != self.options.pause_variance
585
+ or state.pause_mode != self.options.pause_mode
586
+ ):
587
+ self.log(
588
+ f"Restoring settings from previous session: "
589
+ f"voice={state.voice}, speed={state.speed}, "
590
+ f"silence={state.silence_between_chapters}s, "
591
+ f"pause_clause={state.pause_clause}s, "
592
+ f"pause_sentence={state.pause_sentence}s, "
593
+ f"pause_paragraph={state.pause_paragraph}s, "
594
+ f"pause_variance={state.pause_variance}s, "
595
+ f"pause_mode={state.pause_mode}",
596
+ "info",
597
+ )
598
+ # Apply saved settings for consistency
599
+ self.options.voice = state.voice
600
+ self.options.speed = state.speed
601
+ self.options.output_format = state.output_format
602
+ self.options.silence_between_chapters = (
603
+ state.silence_between_chapters
604
+ )
605
+ self.options.pause_clause = state.pause_clause
606
+ self.options.pause_sentence = state.pause_sentence
607
+ self.options.pause_paragraph = state.pause_paragraph
608
+ self.options.pause_variance = state.pause_variance
609
+ self.options.pause_mode = state.pause_mode
610
+
611
+ if state is None:
612
+ # Create new state
613
+ state = PhonemeConversionState(
614
+ source_file=str(self.book.title),
615
+ output_file=str(output_path),
616
+ work_dir=str(work_dir),
617
+ voice=self.options.voice,
618
+ speed=self.options.speed,
619
+ output_format=self.options.output_format,
620
+ silence_between_chapters=self.options.silence_between_chapters,
621
+ pause_clause=self.options.pause_clause,
622
+ pause_sentence=self.options.pause_sentence,
623
+ pause_paragraph=self.options.pause_paragraph,
624
+ pause_variance=self.options.pause_variance,
625
+ pause_mode=self.options.pause_mode,
626
+ chapters=[
627
+ PhonemeChapterState(
628
+ index=idx,
629
+ title=self.book.chapters[idx].title,
630
+ segment_count=len(self.book.chapters[idx].segments),
631
+ )
632
+ for idx in selected_indices
633
+ ],
634
+ started_at=time.strftime("%Y-%m-%d %H:%M:%S"),
635
+ selected_chapters=selected_indices,
636
+ )
637
+ state.save(state_file)
638
+ else:
639
+ completed = state.get_completed_count()
640
+ total = len(selected_chapters)
641
+ self.log(f"Resuming conversion: {completed}/{total} chapters completed")
642
+
643
+ opts = KokoroRunOptions(
644
+ voice=self.options.voice,
645
+ speed=self.options.speed,
646
+ use_gpu=self.options.use_gpu,
647
+ pause_clause=self.options.pause_clause,
648
+ pause_sentence=self.options.pause_sentence,
649
+ pause_paragraph=self.options.pause_paragraph,
650
+ pause_variance=self.options.pause_variance,
651
+ model_path=self.options.model_path,
652
+ voices_path=self.options.voices_path,
653
+ voice_blend=self.options.voice_blend,
654
+ voice_database=self.options.voice_database,
655
+ )
656
+ self._runner = KokoroRunner(opts, log=self.log)
657
+ self._runner.ensure_ready()
658
+
659
+ total_segments = sum(len(ch.segments) for ch in selected_chapters)
660
+ # Account for already completed chapters
661
+ segments_already_done = sum(
662
+ state.chapters[i].segment_count
663
+ for i in range(len(state.chapters))
664
+ if state.chapters[i].completed
665
+ )
666
+ segments_processed = segments_already_done
667
+ start_time = time.time()
668
+
669
+ progress = PhonemeConversionProgress(
670
+ total_chapters=len(selected_chapters),
671
+ total_segments_all=total_segments,
672
+ segments_processed=segments_processed,
673
+ )
674
+
675
+ # Convert each chapter
676
+ for state_idx, chapter_state in enumerate(state.chapters):
677
+ if self._cancel_event.is_set():
678
+ state.save(state_file)
679
+ return PhonemeConversionResult(
680
+ success=False,
681
+ error_message="Cancelled",
682
+ chapters_dir=work_dir,
683
+ )
684
+
685
+ chapter_idx = chapter_state.index
686
+ chapter = self.book.chapters[chapter_idx]
687
+
688
+ # Skip already completed chapters
689
+ if chapter_state.completed and chapter_state.audio_file:
690
+ chapter_file = work_dir / chapter_state.audio_file
691
+ if chapter_file.exists():
692
+ ch_num = state_idx + 1
693
+ self.log(
694
+ f"Skipping completed chapter {ch_num}: {chapter.title}"
695
+ )
696
+ continue
697
+ else:
698
+ # File missing, need to reconvert
699
+ chapter_state.completed = False
700
+
701
+ progress.current_chapter = state_idx + 1
702
+ progress.chapter_name = chapter.title
703
+ progress.total_segments = len(chapter.segments)
704
+ progress.current_segment = 0
705
+
706
+ ch_num = state_idx + 1
707
+ total_ch = len(state.chapters)
708
+ self.log(f"Converting chapter {ch_num}/{total_ch}: {chapter.title}")
709
+
710
+ # Generate chapter filename using template
711
+ chapter_filename = (
712
+ format_filename_template(
713
+ self.options.chapter_filename_template,
714
+ book_title=self.options.title or self.book.title or "Untitled",
715
+ chapter_title=chapter.title,
716
+ chapter_num=state_idx + 1,
717
+ )
718
+ + ".wav"
719
+ )
720
+ chapter_file = work_dir / chapter_filename
721
+
722
+ # Convert chapter to WAV
723
+ duration, segs_done = self._convert_chapter_to_wav(
724
+ chapter,
725
+ chapter_file,
726
+ progress=progress,
727
+ start_time=start_time,
728
+ segments_before=segments_processed,
729
+ )
730
+
731
+ if self._cancel_event.is_set():
732
+ # Remove incomplete file
733
+ chapter_file.unlink(missing_ok=True)
734
+ state.save(state_file)
735
+ return PhonemeConversionResult(
736
+ success=False,
737
+ error_message="Cancelled",
738
+ chapters_dir=work_dir,
739
+ )
740
+
741
+ # Update state
742
+ chapter_state.completed = True
743
+ chapter_state.audio_file = chapter_filename
744
+ chapter_state.duration = duration
745
+ state.save(state_file)
746
+
747
+ # Update progress
748
+ segments_processed += segs_done
749
+ progress.segments_processed = segments_processed
750
+ elapsed = time.time() - start_time
751
+ if segments_processed > segments_already_done and elapsed > 0.5:
752
+ segs_in_session = segments_processed - segments_already_done
753
+ avg_time = elapsed / segs_in_session
754
+ remaining = total_segments - segments_processed
755
+ progress.estimated_remaining = avg_time * remaining
756
+ progress.elapsed_time = elapsed
757
+
758
+ if self.progress_callback:
759
+ self.progress_callback(progress)
760
+
761
+ # All chapters completed, merge into final output
762
+ self.log("Merging chapters into final audiobook...")
763
+
764
+ chapter_files = [
765
+ work_dir / ch.audio_file for ch in state.chapters if ch.audio_file
766
+ ]
767
+ chapter_durations = [ch.duration for ch in state.chapters]
768
+ chapter_titles = [ch.title for ch in state.chapters]
769
+
770
+ meta = MergeMeta(
771
+ fmt=self.options.output_format,
772
+ silence_between_chapters=self.options.silence_between_chapters,
773
+ title=self.options.title,
774
+ author=self.options.author,
775
+ cover_image=self.options.cover_image,
776
+ )
777
+ self._merger.merge_chapter_wavs(
778
+ chapter_files,
779
+ chapter_durations,
780
+ chapter_titles,
781
+ output_path,
782
+ meta,
783
+ )
784
+
785
+ total_duration = sum(chapter_durations)
786
+ self.log(
787
+ f"Conversion complete! Duration: {format_duration(total_duration)}"
788
+ )
789
+
790
+ # Clean up work directory if not keeping chapter files
791
+ if not self.options.keep_chapter_files:
792
+ for f in work_dir.iterdir():
793
+ f.unlink()
794
+ work_dir.rmdir()
795
+ work_dir = None # type: ignore
796
+
797
+ return PhonemeConversionResult(
798
+ success=True,
799
+ output_path=output_path,
800
+ chapters_dir=work_dir,
801
+ duration=total_duration,
802
+ )
803
+
804
+ except Exception as e:
805
+ import traceback
806
+
807
+ error_msg = f"{str(e)}\n\nTraceback:\n{traceback.format_exc()}"
808
+ return PhonemeConversionResult(success=False, error_message=error_msg)
809
+ finally:
810
+ prevent_sleep_end()
811
+
812
+ def convert_streaming(self, output_path: Path) -> PhonemeConversionResult:
813
+ """
814
+ Convert PhonemeBook to audio in streaming mode.
815
+
816
+ Audio is written directly to the output file/process without
817
+ intermediate chapter files. This is faster but doesn't support
818
+ resume capability.
819
+
820
+ Args:
821
+ output_path: Output file path
822
+
823
+ Returns:
824
+ PhonemeConversionResult with success status and paths
825
+ """
826
+ selected_chapters = self._get_selected_chapters()
827
+
828
+ if not selected_chapters:
829
+ return PhonemeConversionResult(
830
+ success=False, error_message="No chapters to convert"
831
+ )
832
+
833
+ if self.options.output_format not in SUPPORTED_OUTPUT_FORMATS:
834
+ return PhonemeConversionResult(
835
+ success=False,
836
+ error_message=f"Unsupported format: {self.options.output_format}",
837
+ )
838
+
839
+ self._cancel_event.clear()
840
+ prevent_sleep_start()
841
+
842
+ try:
843
+ opts = KokoroRunOptions(
844
+ voice=self.options.voice,
845
+ speed=self.options.speed,
846
+ use_gpu=self.options.use_gpu,
847
+ pause_clause=self.options.pause_clause,
848
+ pause_sentence=self.options.pause_sentence,
849
+ pause_paragraph=self.options.pause_paragraph,
850
+ pause_variance=self.options.pause_variance,
851
+ model_path=self.options.model_path,
852
+ voices_path=self.options.voices_path,
853
+ voice_blend=self.options.voice_blend,
854
+ voice_database=self.options.voice_database,
855
+ )
856
+ self._runner = KokoroRunner(opts, log=self.log)
857
+ self._runner.ensure_ready()
858
+
859
+ total_segments = sum(len(ch.segments) for ch in selected_chapters)
860
+ segments_processed = 0
861
+ start_time = time.time()
862
+ current_time = 0.0
863
+ chapter_times: list[dict[str, Any]] = []
864
+
865
+ progress = PhonemeConversionProgress(
866
+ total_chapters=len(selected_chapters),
867
+ total_segments_all=total_segments,
868
+ )
869
+
870
+ # Set up output
871
+ out_file, ffmpeg_proc = self._setup_output(output_path)
872
+
873
+ for chapter_idx, chapter in enumerate(selected_chapters):
874
+ if self._cancel_event.is_set():
875
+ break
876
+
877
+ progress.current_chapter = chapter_idx + 1
878
+ progress.chapter_name = chapter.title
879
+ progress.total_segments = len(chapter.segments)
880
+ progress.current_segment = 0
881
+
882
+ ch_num = chapter_idx + 1
883
+ total_ch = len(selected_chapters)
884
+ self.log(f"Converting chapter {ch_num}/{total_ch}: {chapter.title}")
885
+
886
+ chapter_start = current_time
887
+
888
+ total_chapter_segments = len(chapter.segments)
889
+ if not self._cancel_event.is_set() and chapter.segments:
890
+ assert self._runner is not None
891
+ lang_code = (
892
+ get_onnx_lang_code(self.options.lang)
893
+ if self.options.lang
894
+ else (chapter.segments[0].lang if chapter.segments else "en-us")
895
+ )
896
+ ssmd_text = self._phoneme_segments_to_ssmd(chapter.segments)
897
+ samples = self._runner.synthesize(
898
+ ssmd_text,
899
+ lang_code=lang_code,
900
+ pause_mode=cast(
901
+ Literal["tts", "manual", "auto"],
902
+ self.options.pause_mode,
903
+ ),
904
+ is_phonemes=True,
905
+ )
906
+
907
+ self._write_audio_chunk(samples, out_file, ffmpeg_proc)
908
+ current_time += len(samples) / SAMPLE_RATE
909
+ segments_processed += total_chapter_segments
910
+
911
+ # Update progress once per chapter
912
+ progress.current_segment = total_chapter_segments
913
+ progress.segments_processed = segments_processed
914
+ progress.current_text = (
915
+ f"Completed {chapter.title} ({total_chapter_segments} segments)"
916
+ )
917
+ if segments_processed > 0:
918
+ elapsed = time.time() - start_time
919
+ if elapsed > 0.5:
920
+ avg_time = elapsed / segments_processed
921
+ remaining = total_segments - segments_processed
922
+ progress.estimated_remaining = avg_time * remaining
923
+ progress.elapsed_time = elapsed
924
+
925
+ if self.progress_callback:
926
+ self.progress_callback(progress)
927
+
928
+ # Record chapter timing
929
+ chapter_times.append(
930
+ {
931
+ "title": chapter.title,
932
+ "start": chapter_start,
933
+ "end": current_time,
934
+ }
935
+ )
936
+
937
+ # Add silence between chapters
938
+ if (
939
+ chapter_idx < len(selected_chapters) - 1
940
+ and self.options.silence_between_chapters > 0
941
+ ):
942
+ silence = self._generate_silence(
943
+ self.options.silence_between_chapters
944
+ )
945
+ self._write_audio_chunk(silence, out_file, ffmpeg_proc)
946
+ current_time += self.options.silence_between_chapters
947
+
948
+ # Finalize output
949
+ self._finalize_output(out_file, ffmpeg_proc)
950
+
951
+ if self._cancel_event.is_set():
952
+ # Clean up partial file
953
+ output_path.unlink(missing_ok=True)
954
+ return PhonemeConversionResult(success=False, error_message="Cancelled")
955
+
956
+ # Add chapter markers for m4b
957
+ if self.options.output_format == "m4b" and len(chapter_times) > 1:
958
+ self._merger.add_chapters_to_m4b(
959
+ output_path,
960
+ chapter_times,
961
+ self.options.cover_image,
962
+ )
963
+
964
+ self.log(f"Conversion complete! Duration: {format_duration(current_time)}")
965
+
966
+ return PhonemeConversionResult(
967
+ success=True,
968
+ output_path=output_path,
969
+ duration=current_time,
970
+ )
971
+
972
+ except Exception as e:
973
+ import traceback
974
+
975
+ error_msg = f"{str(e)}\n\nTraceback:\n{traceback.format_exc()}"
976
+ return PhonemeConversionResult(success=False, error_message=error_msg)
977
+ finally:
978
+ prevent_sleep_end()