ttsforge 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ttsforge/__init__.py +114 -0
- ttsforge/_version.py +34 -0
- ttsforge/audio_merge.py +180 -0
- ttsforge/audio_player.py +473 -0
- ttsforge/chapter_selection.py +75 -0
- ttsforge/cli/__init__.py +73 -0
- ttsforge/cli/commands_conversion.py +1927 -0
- ttsforge/cli/commands_phonemes.py +1033 -0
- ttsforge/cli/commands_utility.py +1389 -0
- ttsforge/cli/helpers.py +76 -0
- ttsforge/constants.py +164 -0
- ttsforge/conversion.py +1090 -0
- ttsforge/input_reader.py +408 -0
- ttsforge/kokoro_lang.py +12 -0
- ttsforge/kokoro_runner.py +125 -0
- ttsforge/name_extractor.py +305 -0
- ttsforge/phoneme_conversion.py +978 -0
- ttsforge/phonemes.py +486 -0
- ttsforge/ssmd_generator.py +422 -0
- ttsforge/utils.py +785 -0
- ttsforge/vocab/__init__.py +139 -0
- ttsforge-0.1.0.dist-info/METADATA +659 -0
- ttsforge-0.1.0.dist-info/RECORD +27 -0
- ttsforge-0.1.0.dist-info/WHEEL +5 -0
- ttsforge-0.1.0.dist-info/entry_points.txt +2 -0
- ttsforge-0.1.0.dist-info/licenses/LICENSE +21 -0
- ttsforge-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,978 @@
|
|
|
1
|
+
"""Phoneme-based TTS conversion module for ttsforge.
|
|
2
|
+
|
|
3
|
+
This module converts pre-tokenized PhonemeBook files to audio,
|
|
4
|
+
bypassing text-to-phoneme conversion since phonemes/tokens are pre-computed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import subprocess
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Literal, Optional, cast
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import soundfile as sf
|
|
18
|
+
|
|
19
|
+
from .audio_merge import AudioMerger, MergeMeta
|
|
20
|
+
from .chapter_selection import parse_chapter_selection
|
|
21
|
+
from .constants import SAMPLE_RATE, SUPPORTED_OUTPUT_FORMATS
|
|
22
|
+
from .kokoro_lang import get_onnx_lang_code
|
|
23
|
+
from .kokoro_runner import KokoroRunner, KokoroRunOptions
|
|
24
|
+
from .phonemes import PhonemeBook, PhonemeChapter, PhonemeSegment
|
|
25
|
+
from .utils import (
|
|
26
|
+
atomic_write_json,
|
|
27
|
+
create_process,
|
|
28
|
+
format_duration,
|
|
29
|
+
format_filename_template,
|
|
30
|
+
get_ffmpeg_path,
|
|
31
|
+
prevent_sleep_end,
|
|
32
|
+
prevent_sleep_start,
|
|
33
|
+
sanitize_filename,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class PhonemeConversionProgress:
    """Progress information during phoneme conversion."""

    # 1-based index of the chapter currently being converted.
    current_chapter: int = 0
    # Number of chapters selected for this run.
    total_chapters: int = 0
    # Title of the chapter currently being converted.
    chapter_name: str = ""
    # Segments finished within the current chapter.
    current_segment: int = 0
    # Segment count of the current chapter.
    total_segments: int = 0
    segments_processed: int = 0  # Global segment count
    total_segments_all: int = 0  # Total segments across all chapters
    # Short human-readable status line (e.g. "Completed <title> (N segments)").
    current_text: str = ""
    # Wall-clock seconds since conversion started.
    elapsed_time: float = 0.0
    # Estimated seconds remaining, derived from average per-segment time.
    estimated_remaining: float = 0.0

    @property
    def percent(self) -> int:
        """Overall completion percentage, capped at 99 until finalization.

        Returns 0 when the total segment count is unknown (avoids division
        by zero); never returns 100 so the caller can reserve that for the
        post-merge "done" state.
        """
        if self.total_segments_all == 0:
            return 0
        return min(int(self.segments_processed / self.total_segments_all * 100), 99)

    @property
    def etr_formatted(self) -> str:
        """Estimated time remaining as a human-readable duration string."""
        return format_duration(self.estimated_remaining)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class PhonemeConversionResult:
    """Result of a phoneme conversion operation."""

    # True when the final audiobook was produced successfully.
    success: bool
    # Path of the finished output file (None on failure).
    output_path: Path | None = None
    # Error description on failure; may include a full traceback.
    error_message: str | None = None
    # Work directory holding per-chapter WAVs, when it still exists
    # (kept on cancellation or when keep_chapter_files is set).
    chapters_dir: Path | None = None
    # Total audio duration in seconds.
    duration: float = 0.0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class PhonemeChapterState:
    """State of a single chapter conversion."""

    # 0-based index of the chapter in the source PhonemeBook.
    index: int
    # Chapter title (also used for filename templating and m4b chapters).
    title: str
    # Number of phoneme segments in this chapter (used for ETA math).
    segment_count: int
    # True once the chapter's WAV has been fully written.
    completed: bool = False
    audio_file: str | None = None  # Relative path to chapter audio
    # Duration of the rendered chapter audio in seconds.
    duration: float = 0.0
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class PhonemeConversionState:
    """Persistent state for resumable phoneme conversions."""

    # Schema version of the state file.
    version: int = 1
    # Identifier of the source (the book title is stored here).
    source_file: str = ""
    # Absolute path of the final output file.
    output_file: str = ""
    # Directory holding per-chapter WAVs and this state file.
    work_dir: str = ""
    voice: str = ""
    speed: float = 1.0
    output_format: str = "m4b"
    silence_between_chapters: float = 2.0
    pause_clause: float = 0.25
    pause_sentence: float = 0.2
    pause_paragraph: float = 0.75
    pause_variance: float = 0.05
    pause_mode: str = "auto"
    lang: str | None = None  # Language override for phonemization
    chapters: list[PhonemeChapterState] = field(default_factory=list)
    started_at: str = ""
    last_updated: str = ""
    # Track selected chapters (0-based indices)
    selected_chapters: list[int] = field(default_factory=list)

    def get_completed_count(self) -> int:
        """Get number of completed chapters."""
        return sum(1 for ch in self.chapters if ch.completed)

    @classmethod
    def load(cls, state_file: Path) -> Optional["PhonemeConversionState"]:
        """Load state from a JSON file.

        Returns None when the file is missing or unreadable, so callers
        fall back to a fresh conversion instead of crashing. Older state
        files are migrated in place: legacy min/max pause pairs are
        collapsed to a mean plus a variance, and any fields added since
        are given their current defaults.
        """
        if not state_file.exists():
            return None
        try:
            with open(state_file, encoding="utf-8") as f:
                data = json.load(f)

            # Reconstruct PhonemeChapterState objects
            chapters = [PhonemeChapterState(**ch) for ch in data.get("chapters", [])]
            data["chapters"] = chapters

            # Handle missing fields for backward compatibility
            if "silence_between_chapters" not in data:
                data["silence_between_chapters"] = 2.0
            if "selected_chapters" not in data:
                data["selected_chapters"] = []

            # Migrate old pause parameters to new system
            if "segment_pause_min" in data or "segment_pause_max" in data:
                seg_min = data.get("segment_pause_min", 0.1)
                seg_max = data.get("segment_pause_max", 0.3)
                # Midpoint of the legacy range becomes the fixed sentence pause.
                data["pause_sentence"] = (seg_min + seg_max) / 2.0
                if "pause_variance" not in data:
                    # Quarter of the legacy spread approximates one std-dev.
                    data["pause_variance"] = max(0.01, (seg_max - seg_min) / 4.0)

            if "paragraph_pause_min" in data or "paragraph_pause_max" in data:
                para_min = data.get("paragraph_pause_min", 0.5)
                para_max = data.get("paragraph_pause_max", 1.0)
                data["pause_paragraph"] = (para_min + para_max) / 2.0

            # Set defaults for new parameters
            if "pause_clause" not in data:
                data["pause_clause"] = 0.25
            if "pause_sentence" not in data:
                data["pause_sentence"] = 0.2
            if "pause_paragraph" not in data:
                data["pause_paragraph"] = 0.75
            if "pause_variance" not in data:
                data["pause_variance"] = 0.05
            if "pause_mode" not in data:
                data["pause_mode"] = "auto"
            if "lang" not in data:
                data["lang"] = None

            # NOTE(review): any legacy keys left in `data` (e.g.
            # segment_pause_min) would make cls(**data) raise TypeError,
            # which is caught below and treated as "no resumable state".
            return cls(**data)
        except (json.JSONDecodeError, TypeError, KeyError):
            return None

    def save(self, state_file: Path) -> None:
        """Save state to a JSON file.

        Serializes explicitly (rather than dataclasses.asdict) so only the
        current schema's keys are written; refreshes last_updated first and
        writes atomically to avoid corrupting state on interruption.
        """
        self.last_updated = time.strftime("%Y-%m-%d %H:%M:%S")
        data = {
            "version": self.version,
            "source_file": self.source_file,
            "output_file": self.output_file,
            "work_dir": self.work_dir,
            "voice": self.voice,
            "speed": self.speed,
            "output_format": self.output_format,
            "silence_between_chapters": self.silence_between_chapters,
            "pause_clause": self.pause_clause,
            "pause_sentence": self.pause_sentence,
            "pause_paragraph": self.pause_paragraph,
            "pause_variance": self.pause_variance,
            "pause_mode": self.pause_mode,
            "lang": self.lang,
            "chapters": [
                {
                    "index": ch.index,
                    "title": ch.title,
                    "segment_count": ch.segment_count,
                    "completed": ch.completed,
                    "audio_file": ch.audio_file,
                    "duration": ch.duration,
                }
                for ch in self.chapters
            ],
            "started_at": self.started_at,
            "last_updated": self.last_updated,
            "selected_chapters": self.selected_chapters,
        }
        atomic_write_json(state_file, data, indent=2, ensure_ascii=True)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@dataclass
class PhonemeConversionOptions:
    """Options for phoneme-based TTS conversion.

    Mirrors the persisted fields of PhonemeConversionState; when resuming,
    several of these are overwritten from the saved state to keep the
    rendered audio consistent across sessions.
    """

    # Kokoro voice identifier.
    voice: str = "af_heart"
    # Playback speed multiplier.
    speed: float = 1.0
    # One of SUPPORTED_OUTPUT_FORMATS (e.g. "wav", "m4b", "opus").
    output_format: str = "m4b"
    use_gpu: bool = False
    # Seconds of silence inserted between chapters during the final merge.
    silence_between_chapters: float = 2.0
    # Language override for phonemization (e.g., 'de', 'en-us', 'fr')
    # If None, language from PhonemeSegments is used
    lang: str | None = None
    # Pause settings (pykokoro built-in pause handling)
    pause_clause: float = 0.25  # For clause boundaries (commas)
    pause_sentence: float = 0.2  # For sentence boundaries
    pause_paragraph: float = 0.75  # For paragraph boundaries
    pause_variance: float = 0.05  # Standard deviation for natural variation
    pause_mode: str = "auto"  # "tts", "manual", or "auto"
    # Chapter announcement settings
    announce_chapters: bool = True  # Read chapter titles aloud before content
    chapter_pause_after_title: float = 2.0  # Pause after chapter title (seconds)
    # Metadata for m4b
    title: str | None = None
    author: str | None = None
    cover_image: Path | None = None
    # Voice blending (e.g., "af_nicole:50,am_michael:50")
    voice_blend: str | None = None
    # Voice database for custom/synthetic voices
    voice_database: Path | None = None
    # Chapter selection (e.g., "1-5" or "3,5,7") - 1-based
    chapters: str | None = None
    # Resume capability
    resume: bool = True
    # Keep chapter files after merge
    keep_chapter_files: bool = False
    # Filename template for chapter files
    chapter_filename_template: str = "{chapter_num:03d}_{book_title}_{chapter_title}"
    # Custom ONNX model path (None = use default downloaded model)
    model_path: Path | None = None
    # Custom voices.bin path (None = use default downloaded voices)
    voices_path: Path | None = None
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class PhonemeConverter:
|
|
244
|
+
"""Converts PhonemeBook to audio using pre-tokenized phonemes/tokens."""
|
|
245
|
+
|
|
246
|
+
    def __init__(
        self,
        book: PhonemeBook,
        options: PhonemeConversionOptions,
        progress_callback: Callable[[PhonemeConversionProgress], None] | None = None,
        log_callback: Callable[[str, str], None] | None = None,
    ) -> None:
        """
        Initialize the phoneme converter.

        Args:
            book: PhonemeBook to convert
            options: Conversion options
            progress_callback: Called with progress updates
            log_callback: Called with log messages (message, level)
        """
        self.book = book
        self.options = options
        self.progress_callback = progress_callback
        self.log_callback = log_callback
        # Set from another thread via cancel(); polled at chapter boundaries.
        self._cancel_event = threading.Event()
        # Lazily created in convert()/convert_streaming().
        self._runner: KokoroRunner | None = None
        # Merger routes its log output through self.log.
        self._merger = AudioMerger(log=self.log)
|
|
269
|
+
|
|
270
|
+
@property
|
|
271
|
+
def _cancelled(self) -> bool:
|
|
272
|
+
return self._cancel_event.is_set()
|
|
273
|
+
|
|
274
|
+
def log(self, message: str, level: str = "info") -> None:
|
|
275
|
+
"""Log a message."""
|
|
276
|
+
if self.log_callback:
|
|
277
|
+
self.log_callback(message, level)
|
|
278
|
+
|
|
279
|
+
def cancel(self) -> None:
|
|
280
|
+
"""Request cancellation of the conversion."""
|
|
281
|
+
self._cancel_event.set()
|
|
282
|
+
|
|
283
|
+
def _phoneme_segments_to_ssmd(self, segments: list[PhonemeSegment]) -> str:
|
|
284
|
+
"""Build SSMD text from phoneme segments."""
|
|
285
|
+
parts: list[str] = []
|
|
286
|
+
for idx, segment in enumerate(segments):
|
|
287
|
+
phonemes = segment.phonemes.strip()
|
|
288
|
+
if not phonemes:
|
|
289
|
+
continue
|
|
290
|
+
parts.append(phonemes)
|
|
291
|
+
if idx >= len(segments) - 1:
|
|
292
|
+
continue
|
|
293
|
+
next_segment = segments[idx + 1]
|
|
294
|
+
strength = "p" if next_segment.paragraph != segment.paragraph else "s"
|
|
295
|
+
parts.append(f"...{strength}")
|
|
296
|
+
parts.append("\n" if strength == "p" else " ")
|
|
297
|
+
return "".join(parts).strip()
|
|
298
|
+
|
|
299
|
+
def _generate_silence(self, duration: float) -> np.ndarray:
|
|
300
|
+
"""Generate silence audio of given duration."""
|
|
301
|
+
samples = int(duration * SAMPLE_RATE)
|
|
302
|
+
return np.zeros(samples, dtype="float32")
|
|
303
|
+
|
|
304
|
+
    def _setup_output(
        self, output_path: Path
    ) -> tuple[sf.SoundFile | None, subprocess.Popen[bytes] | None]:
        """Set up output file or ffmpeg process based on format.

        Returns exactly one non-None element: a SoundFile for "wav" output,
        or a Popen whose stdin accepts raw f32le mono PCM at SAMPLE_RATE
        for the ffmpeg-encoded formats. The caller is responsible for
        closing whichever was returned (see _finalize_output).
        """
        fmt = self.options.output_format

        if fmt == "wav":
            out_file = sf.SoundFile(
                str(output_path),
                "w",
                samplerate=SAMPLE_RATE,
                channels=1,
                format=fmt,
            )
            return out_file, None

        # Formats requiring ffmpeg
        ffmpeg = get_ffmpeg_path()

        # Base command: read raw float32 mono PCM from stdin. The large
        # thread_queue_size keeps ffmpeg from stalling the producer.
        cmd = [
            ffmpeg,
            "-y",
            "-thread_queue_size",
            "32768",
            "-f",
            "f32le",
            "-ar",
            str(SAMPLE_RATE),
            "-ac",
            "1",
            "-i",
            "pipe:0",
        ]

        if fmt == "m4b":
            # Add cover image if provided
            if self.options.cover_image and self.options.cover_image.exists():
                # Second input: embed the image as attached cover art.
                cmd.extend(
                    [
                        "-i",
                        str(self.options.cover_image),
                        "-map",
                        "0:a",
                        "-map",
                        "1",
                        "-c:v",
                        "copy",
                        "-disposition:v",
                        "attached_pic",
                    ]
                )
            cmd.extend(
                [
                    "-c:a",
                    "aac",
                    "-q:a",
                    "2",
                    "-movflags",
                    "+faststart+use_metadata_tags",
                ]
            )
            # Add metadata
            if self.options.title:
                cmd.extend(["-metadata", f"title={self.options.title}"])
            if self.options.author:
                cmd.extend(["-metadata", f"artist={self.options.author}"])
        elif fmt == "opus":
            cmd.extend(["-c:a", "libopus", "-b:a", "24000"])

        cmd.append(str(output_path))

        ffmpeg_proc = cast(
            subprocess.Popen[bytes],
            create_process(
                cmd, stdin=subprocess.PIPE, text=False, suppress_output=True
            ),
        )
        return None, ffmpeg_proc
|
|
382
|
+
|
|
383
|
+
def _finalize_output(
|
|
384
|
+
self,
|
|
385
|
+
out_file: sf.SoundFile | None,
|
|
386
|
+
ffmpeg_proc: subprocess.Popen[bytes] | None,
|
|
387
|
+
) -> None:
|
|
388
|
+
"""Finalize and close output file/process."""
|
|
389
|
+
if out_file is not None:
|
|
390
|
+
out_file.close()
|
|
391
|
+
elif ffmpeg_proc is not None:
|
|
392
|
+
if ffmpeg_proc.stdin is not None:
|
|
393
|
+
ffmpeg_proc.stdin.close()
|
|
394
|
+
ffmpeg_proc.wait()
|
|
395
|
+
|
|
396
|
+
def _write_audio_chunk(
|
|
397
|
+
self,
|
|
398
|
+
audio: np.ndarray,
|
|
399
|
+
out_file: sf.SoundFile | None,
|
|
400
|
+
ffmpeg_proc: subprocess.Popen[bytes] | None,
|
|
401
|
+
) -> None:
|
|
402
|
+
"""Write audio chunk to file or ffmpeg process."""
|
|
403
|
+
if out_file is not None:
|
|
404
|
+
out_file.write(audio)
|
|
405
|
+
elif ffmpeg_proc is not None and ffmpeg_proc.stdin is not None:
|
|
406
|
+
audio_bytes = audio.astype("float32").tobytes()
|
|
407
|
+
ffmpeg_proc.stdin.write(audio_bytes)
|
|
408
|
+
|
|
409
|
+
    def _convert_chapter_to_wav(
        self,
        chapter: PhonemeChapter,
        output_file: Path,
        progress: PhonemeConversionProgress | None = None,
        start_time: float | None = None,
        segments_before: int = 0,
    ) -> tuple[float, int]:
        """
        Convert a single chapter to a WAV file.

        Args:
            chapter: PhonemeChapter to convert
            output_file: Output WAV file path
            progress: Optional progress object to update
            start_time: Conversion start time for ETA calculation
            segments_before: Segments processed before this chapter

        Returns:
            Tuple of (duration in seconds, segments processed)
        """
        segments_processed = 0
        total_segments = len(chapter.segments)
        # Caller must have created the runner (convert()/convert_streaming()).
        assert self._runner is not None
        # Explicit language option wins; otherwise use the first segment's
        # language, falling back to "en-us" for empty chapters.
        lang_code = (
            get_onnx_lang_code(self.options.lang)
            if self.options.lang
            else (chapter.segments[0].lang if chapter.segments else "en-us")
        )

        # Open WAV file for writing
        with sf.SoundFile(
            str(output_file),
            "w",
            samplerate=SAMPLE_RATE,
            channels=1,
            format="wav",
        ) as out_file:
            duration = 0.0

            # Announce chapter title if enabled
            # Only announce if there are segments to follow
            if self.options.announce_chapters and chapter.title and chapter.segments:
                # Title is plain text (not phonemes), so let the TTS engine
                # handle its own pausing.
                title_samples = self._runner.synthesize(
                    chapter.title,
                    lang_code=lang_code,
                    pause_mode="tts",
                    is_phonemes=False,
                )
                out_file.write(title_samples)
                duration += len(title_samples) / SAMPLE_RATE

                # Add pause after chapter title
                pause_duration = self.options.chapter_pause_after_title
                if pause_duration > 0:
                    pause_samples = int(pause_duration * SAMPLE_RATE)
                    pause_audio = np.zeros(pause_samples, dtype=np.float32)
                    out_file.write(pause_audio)
                    duration += pause_duration

            if not self._cancel_event.is_set() and chapter.segments:
                # Single pipeline call for entire chapter
                ssmd_text = self._phoneme_segments_to_ssmd(chapter.segments)
                samples = self._runner.synthesize(
                    ssmd_text,
                    lang_code=lang_code,
                    pause_mode=cast(
                        Literal["tts", "manual", "auto"], self.options.pause_mode
                    ),
                    is_phonemes=True,
                )

                out_file.write(samples)
                duration += len(samples) / SAMPLE_RATE
                # Whole chapter is rendered in one call, so either all
                # segments are done or (on cancel/empty) none are.
                segments_processed = total_segments

            # Update progress once per chapter
            if progress and self.progress_callback:
                progress.current_segment = segments_processed
                progress.segments_processed = segments_before + segments_processed
                ch_title = chapter.title or "chapter"
                progress.current_text = (
                    f"Completed {ch_title} ({segments_processed} segments)"
                )
                if start_time and progress.total_segments_all > 0:
                    elapsed = time.time() - start_time
                    # Skip ETA on the very first instants to avoid noise.
                    if progress.segments_processed > 0 and elapsed > 0.5:
                        avg_time = elapsed / progress.segments_processed
                        remaining = (
                            progress.total_segments_all
                            - progress.segments_processed
                        )
                        progress.estimated_remaining = avg_time * remaining
                    progress.elapsed_time = elapsed
                self.progress_callback(progress)

        return duration, segments_processed
|
|
506
|
+
|
|
507
|
+
def _get_selected_chapters(self) -> list[PhonemeChapter]:
|
|
508
|
+
"""Get chapters based on selection option."""
|
|
509
|
+
if not self.options.chapters:
|
|
510
|
+
return list(self.book.chapters)
|
|
511
|
+
|
|
512
|
+
indices = parse_chapter_selection(
|
|
513
|
+
self.options.chapters, len(self.book.chapters)
|
|
514
|
+
)
|
|
515
|
+
return [self.book.chapters[i] for i in indices]
|
|
516
|
+
|
|
517
|
+
def _get_selected_indices(self) -> list[int]:
|
|
518
|
+
"""Get 0-based chapter indices based on selection option."""
|
|
519
|
+
if not self.options.chapters:
|
|
520
|
+
return list(range(len(self.book.chapters)))
|
|
521
|
+
|
|
522
|
+
return parse_chapter_selection(self.options.chapters, len(self.book.chapters))
|
|
523
|
+
|
|
524
|
+
    def convert(self, output_path: Path) -> PhonemeConversionResult:
        """
        Convert PhonemeBook to audio with resume capability.

        Each chapter is saved as a separate WAV file, allowing conversion
        to be resumed if interrupted. A state file tracks progress.

        Args:
            output_path: Output file path

        Returns:
            PhonemeConversionResult with success status and paths
        """
        selected_chapters = self._get_selected_chapters()
        selected_indices = self._get_selected_indices()

        if not selected_chapters:
            return PhonemeConversionResult(
                success=False, error_message="No chapters to convert"
            )

        if self.options.output_format not in SUPPORTED_OUTPUT_FORMATS:
            return PhonemeConversionResult(
                success=False,
                error_message=f"Unsupported format: {self.options.output_format}",
            )

        self._cancel_event.clear()
        # Keep the OS awake for the duration of the (long) conversion;
        # released in the finally block.
        prevent_sleep_start()

        try:
            # Set up work directory for chapter files (use book title)
            safe_book_title = sanitize_filename(
                self.options.title or self.book.title or output_path.stem
            )[:50]
            work_dir = output_path.parent / f".{safe_book_title}_chapters"
            work_dir.mkdir(parents=True, exist_ok=True)
            state_file = work_dir / f"{safe_book_title}_state.json"

            # Load or create state
            state: PhonemeConversionState | None = None
            if self.options.resume and state_file.exists():
                state = PhonemeConversionState.load(state_file)
                if state:
                    # Check if selected chapters match
                    if state.selected_chapters != selected_indices:
                        # Different selection invalidates per-chapter files;
                        # discard the old state entirely.
                        self.log(
                            "Chapter selection changed, starting fresh conversion",
                            "warning",
                        )
                        state = None
                    # Check if settings differ from saved state
                    elif (
                        state.voice != self.options.voice
                        or state.speed != self.options.speed
                        or state.silence_between_chapters
                        != self.options.silence_between_chapters
                        or state.pause_clause != self.options.pause_clause
                        or state.pause_sentence != self.options.pause_sentence
                        or state.pause_paragraph != self.options.pause_paragraph
                        or state.pause_variance != self.options.pause_variance
                        or state.pause_mode != self.options.pause_mode
                    ):
                        self.log(
                            f"Restoring settings from previous session: "
                            f"voice={state.voice}, speed={state.speed}, "
                            f"silence={state.silence_between_chapters}s, "
                            f"pause_clause={state.pause_clause}s, "
                            f"pause_sentence={state.pause_sentence}s, "
                            f"pause_paragraph={state.pause_paragraph}s, "
                            f"pause_variance={state.pause_variance}s, "
                            f"pause_mode={state.pause_mode}",
                            "info",
                        )
                        # Apply saved settings for consistency
                        # (already-rendered chapters used them, so the rest
                        # of the book must too).
                        self.options.voice = state.voice
                        self.options.speed = state.speed
                        self.options.output_format = state.output_format
                        self.options.silence_between_chapters = (
                            state.silence_between_chapters
                        )
                        self.options.pause_clause = state.pause_clause
                        self.options.pause_sentence = state.pause_sentence
                        self.options.pause_paragraph = state.pause_paragraph
                        self.options.pause_variance = state.pause_variance
                        self.options.pause_mode = state.pause_mode

            if state is None:
                # Create new state
                state = PhonemeConversionState(
                    source_file=str(self.book.title),
                    output_file=str(output_path),
                    work_dir=str(work_dir),
                    voice=self.options.voice,
                    speed=self.options.speed,
                    output_format=self.options.output_format,
                    silence_between_chapters=self.options.silence_between_chapters,
                    pause_clause=self.options.pause_clause,
                    pause_sentence=self.options.pause_sentence,
                    pause_paragraph=self.options.pause_paragraph,
                    pause_variance=self.options.pause_variance,
                    pause_mode=self.options.pause_mode,
                    chapters=[
                        PhonemeChapterState(
                            index=idx,
                            title=self.book.chapters[idx].title,
                            segment_count=len(self.book.chapters[idx].segments),
                        )
                        for idx in selected_indices
                    ],
                    started_at=time.strftime("%Y-%m-%d %H:%M:%S"),
                    selected_chapters=selected_indices,
                )
                state.save(state_file)
            else:
                completed = state.get_completed_count()
                total = len(selected_chapters)
                self.log(f"Resuming conversion: {completed}/{total} chapters completed")

            opts = KokoroRunOptions(
                voice=self.options.voice,
                speed=self.options.speed,
                use_gpu=self.options.use_gpu,
                pause_clause=self.options.pause_clause,
                pause_sentence=self.options.pause_sentence,
                pause_paragraph=self.options.pause_paragraph,
                pause_variance=self.options.pause_variance,
                model_path=self.options.model_path,
                voices_path=self.options.voices_path,
                voice_blend=self.options.voice_blend,
                voice_database=self.options.voice_database,
            )
            self._runner = KokoroRunner(opts, log=self.log)
            self._runner.ensure_ready()

            total_segments = sum(len(ch.segments) for ch in selected_chapters)
            # Account for already completed chapters
            segments_already_done = sum(
                state.chapters[i].segment_count
                for i in range(len(state.chapters))
                if state.chapters[i].completed
            )
            segments_processed = segments_already_done
            start_time = time.time()

            progress = PhonemeConversionProgress(
                total_chapters=len(selected_chapters),
                total_segments_all=total_segments,
                segments_processed=segments_processed,
            )

            # Convert each chapter
            for state_idx, chapter_state in enumerate(state.chapters):
                if self._cancel_event.is_set():
                    # Persist progress so a later run can resume.
                    state.save(state_file)
                    return PhonemeConversionResult(
                        success=False,
                        error_message="Cancelled",
                        chapters_dir=work_dir,
                    )

                chapter_idx = chapter_state.index
                chapter = self.book.chapters[chapter_idx]

                # Skip already completed chapters
                if chapter_state.completed and chapter_state.audio_file:
                    chapter_file = work_dir / chapter_state.audio_file
                    if chapter_file.exists():
                        ch_num = state_idx + 1
                        self.log(
                            f"Skipping completed chapter {ch_num}: {chapter.title}"
                        )
                        continue
                    else:
                        # File missing, need to reconvert
                        chapter_state.completed = False

                progress.current_chapter = state_idx + 1
                progress.chapter_name = chapter.title
                progress.total_segments = len(chapter.segments)
                progress.current_segment = 0

                ch_num = state_idx + 1
                total_ch = len(state.chapters)
                self.log(f"Converting chapter {ch_num}/{total_ch}: {chapter.title}")

                # Generate chapter filename using template
                chapter_filename = (
                    format_filename_template(
                        self.options.chapter_filename_template,
                        book_title=self.options.title or self.book.title or "Untitled",
                        chapter_title=chapter.title,
                        chapter_num=state_idx + 1,
                    )
                    + ".wav"
                )
                chapter_file = work_dir / chapter_filename

                # Convert chapter to WAV
                duration, segs_done = self._convert_chapter_to_wav(
                    chapter,
                    chapter_file,
                    progress=progress,
                    start_time=start_time,
                    segments_before=segments_processed,
                )

                if self._cancel_event.is_set():
                    # Remove incomplete file
                    chapter_file.unlink(missing_ok=True)
                    state.save(state_file)
                    return PhonemeConversionResult(
                        success=False,
                        error_message="Cancelled",
                        chapters_dir=work_dir,
                    )

                # Update state
                chapter_state.completed = True
                chapter_state.audio_file = chapter_filename
                chapter_state.duration = duration
                # Save after every chapter so an interruption loses at most
                # the chapter in progress.
                state.save(state_file)

                # Update progress
                segments_processed += segs_done
                progress.segments_processed = segments_processed
                elapsed = time.time() - start_time
                # ETA is based only on segments rendered in this session,
                # so resumed runs don't skew the average.
                if segments_processed > segments_already_done and elapsed > 0.5:
                    segs_in_session = segments_processed - segments_already_done
                    avg_time = elapsed / segs_in_session
                    remaining = total_segments - segments_processed
                    progress.estimated_remaining = avg_time * remaining
                progress.elapsed_time = elapsed

                if self.progress_callback:
                    self.progress_callback(progress)

            # All chapters completed, merge into final output
            self.log("Merging chapters into final audiobook...")

            # NOTE(review): chapter_files filters out entries without an
            # audio_file while durations/titles do not — the lists can only
            # stay aligned because every chapter is completed at this point.
            chapter_files = [
                work_dir / ch.audio_file for ch in state.chapters if ch.audio_file
            ]
            chapter_durations = [ch.duration for ch in state.chapters]
            chapter_titles = [ch.title for ch in state.chapters]

            meta = MergeMeta(
                fmt=self.options.output_format,
                silence_between_chapters=self.options.silence_between_chapters,
                title=self.options.title,
                author=self.options.author,
                cover_image=self.options.cover_image,
            )
            self._merger.merge_chapter_wavs(
                chapter_files,
                chapter_durations,
                chapter_titles,
                output_path,
                meta,
            )

            total_duration = sum(chapter_durations)
            self.log(
                f"Conversion complete! Duration: {format_duration(total_duration)}"
            )

            # Clean up work directory if not keeping chapter files
            if not self.options.keep_chapter_files:
                # NOTE(review): unlink() would fail on a subdirectory;
                # assumes work_dir contains only flat files.
                for f in work_dir.iterdir():
                    f.unlink()
                work_dir.rmdir()
                work_dir = None  # type: ignore

            return PhonemeConversionResult(
                success=True,
                output_path=output_path,
                chapters_dir=work_dir,
                duration=total_duration,
            )

        except Exception as e:
            import traceback

            error_msg = f"{str(e)}\n\nTraceback:\n{traceback.format_exc()}"
            return PhonemeConversionResult(success=False, error_message=error_msg)
        finally:
            prevent_sleep_end()
|
|
811
|
+
|
|
812
|
+
def convert_streaming(self, output_path: Path) -> PhonemeConversionResult:
    """
    Convert PhonemeBook to audio in streaming mode.

    Audio is written directly to the output file/process without
    intermediate chapter files. This is faster but doesn't support
    resume capability.

    Args:
        output_path: Output file path

    Returns:
        PhonemeConversionResult with success status and paths
    """
    selected_chapters = self._get_selected_chapters()

    # Fail fast on empty selection / unsupported format before touching
    # the TTS runner or the output file.
    if not selected_chapters:
        return PhonemeConversionResult(
            success=False, error_message="No chapters to convert"
        )

    if self.options.output_format not in SUPPORTED_OUTPUT_FORMATS:
        return PhonemeConversionResult(
            success=False,
            error_message=f"Unsupported format: {self.options.output_format}",
        )

    # Reset any stale cancel request and keep the machine awake for the
    # duration of the conversion (undone in the finally block).
    self._cancel_event.clear()
    prevent_sleep_start()

    try:
        # Build the runner from the user-facing options; ensure_ready()
        # presumably loads the model/voices up front — confirm in KokoroRunner.
        opts = KokoroRunOptions(
            voice=self.options.voice,
            speed=self.options.speed,
            use_gpu=self.options.use_gpu,
            pause_clause=self.options.pause_clause,
            pause_sentence=self.options.pause_sentence,
            pause_paragraph=self.options.pause_paragraph,
            pause_variance=self.options.pause_variance,
            model_path=self.options.model_path,
            voices_path=self.options.voices_path,
            voice_blend=self.options.voice_blend,
            voice_database=self.options.voice_database,
        )
        self._runner = KokoroRunner(opts, log=self.log)
        self._runner.ensure_ready()

        # Overall progress is tracked in segments across all chapters;
        # current_time is the running output timestamp in seconds and
        # doubles as the chapter-marker clock.
        total_segments = sum(len(ch.segments) for ch in selected_chapters)
        segments_processed = 0
        start_time = time.time()
        current_time = 0.0
        chapter_times: list[dict[str, Any]] = []

        progress = PhonemeConversionProgress(
            total_chapters=len(selected_chapters),
            total_segments_all=total_segments,
        )

        # Set up output: returns a file handle and/or an ffmpeg subprocess
        # (one of the two may be None depending on format — see _setup_output).
        out_file, ffmpeg_proc = self._setup_output(output_path)

        for chapter_idx, chapter in enumerate(selected_chapters):
            if self._cancel_event.is_set():
                break

            progress.current_chapter = chapter_idx + 1
            progress.chapter_name = chapter.title
            progress.total_segments = len(chapter.segments)
            progress.current_segment = 0

            ch_num = chapter_idx + 1
            total_ch = len(selected_chapters)
            self.log(f"Converting chapter {ch_num}/{total_ch}: {chapter.title}")

            # Remember where this chapter starts so we can emit m4b
            # chapter markers after the streaming pass completes.
            chapter_start = current_time

            total_chapter_segments = len(chapter.segments)
            if not self._cancel_event.is_set() and chapter.segments:
                assert self._runner is not None
                # Explicit --lang option wins; otherwise fall back to the
                # first segment's language, defaulting to "en-us".
                lang_code = (
                    get_onnx_lang_code(self.options.lang)
                    if self.options.lang
                    else (chapter.segments[0].lang if chapter.segments else "en-us")
                )
                # The whole chapter is synthesized in one call from its
                # SSMD rendering; is_phonemes=True skips text-to-phoneme
                # conversion since segments are pre-tokenized.
                ssmd_text = self._phoneme_segments_to_ssmd(chapter.segments)
                samples = self._runner.synthesize(
                    ssmd_text,
                    lang_code=lang_code,
                    pause_mode=cast(
                        Literal["tts", "manual", "auto"],
                        self.options.pause_mode,
                    ),
                    is_phonemes=True,
                )

                self._write_audio_chunk(samples, out_file, ffmpeg_proc)
                # Advance the output clock by the synthesized duration.
                current_time += len(samples) / SAMPLE_RATE
                segments_processed += total_chapter_segments

                # Update progress once per chapter (synthesis is one call
                # per chapter here, so per-segment updates aren't possible).
                progress.current_segment = total_chapter_segments
                progress.segments_processed = segments_processed
                progress.current_text = (
                    f"Completed {chapter.title} ({total_chapter_segments} segments)"
                )
                if segments_processed > 0:
                    elapsed = time.time() - start_time
                    # Skip the ETA until enough wall time has passed for a
                    # meaningful per-segment average.
                    if elapsed > 0.5:
                        avg_time = elapsed / segments_processed
                        remaining = total_segments - segments_processed
                        progress.estimated_remaining = avg_time * remaining
                        progress.elapsed_time = elapsed

                if self.progress_callback:
                    self.progress_callback(progress)

            # Record chapter timing (also recorded for skipped/empty
            # chapters, as a zero-length span).
            chapter_times.append(
                {
                    "title": chapter.title,
                    "start": chapter_start,
                    "end": current_time,
                }
            )

            # Add silence between chapters (but not after the last one).
            if (
                chapter_idx < len(selected_chapters) - 1
                and self.options.silence_between_chapters > 0
            ):
                silence = self._generate_silence(
                    self.options.silence_between_chapters
                )
                self._write_audio_chunk(silence, out_file, ffmpeg_proc)
                current_time += self.options.silence_between_chapters

        # Finalize output (flush/close the file and/or wait for ffmpeg —
        # see _finalize_output). Runs even after a cancel-triggered break
        # so the partial file is in a consistent state before removal.
        self._finalize_output(out_file, ffmpeg_proc)

        if self._cancel_event.is_set():
            # Clean up partial file
            output_path.unlink(missing_ok=True)
            return PhonemeConversionResult(success=False, error_message="Cancelled")

        # Add chapter markers for m4b (only worthwhile with 2+ chapters).
        if self.options.output_format == "m4b" and len(chapter_times) > 1:
            self._merger.add_chapters_to_m4b(
                output_path,
                chapter_times,
                self.options.cover_image,
            )

        self.log(f"Conversion complete! Duration: {format_duration(current_time)}")

        # NOTE: no chapters_dir in streaming mode — audio went straight
        # to output_path, so there are no intermediate chapter files.
        return PhonemeConversionResult(
            success=True,
            output_path=output_path,
            duration=current_time,
        )

    except Exception as e:
        import traceback

        # Surface the full traceback in the result rather than raising,
        # so CLI/GUI callers can report it uniformly.
        error_msg = f"{str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return PhonemeConversionResult(success=False, error_message=error_msg)
    finally:
        # Always re-allow system sleep, paired with prevent_sleep_start().
        prevent_sleep_end()
|