ttsforge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1033 @@
1
+ """Phoneme-related CLI commands for ttsforge.
2
+
3
+ This module contains commands for working with phonemes and pre-tokenized content:
4
+ - export: Export EPUB books as pre-tokenized phoneme data (JSON)
5
+ - convert: Convert pre-tokenized phoneme files to audiobooks
6
+ - preview: Preview phonemes for given text
7
+ - info: Show information about a phoneme file
8
+ """
9
+
10
+ import re
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import click
16
+ from rich.progress import (
17
+ BarColumn,
18
+ Progress,
19
+ SpinnerColumn,
20
+ TaskID,
21
+ TaskProgressColumn,
22
+ TextColumn,
23
+ TimeElapsedColumn,
24
+ TimeRemainingColumn,
25
+ )
26
+ from rich.prompt import Confirm
27
+ from rich.table import Table
28
+
29
+ from ..chapter_selection import parse_chapter_selection
30
+ from ..constants import (
31
+ LANGUAGE_DESCRIPTIONS,
32
+ SUPPORTED_OUTPUT_FORMATS,
33
+ VOICES,
34
+ )
35
+ from ..utils import (
36
+ format_chapters_range,
37
+ format_filename_template,
38
+ load_config,
39
+ )
40
+ from .helpers import console, parse_voice_parameter
41
+
42
+
43
def _require_sounddevice() -> Any:
    """Import the optional ``sounddevice`` module and return it.

    If the import fails, an installation hint is printed to the console and
    the process exits with status 1.

    Returns:
        The imported ``sounddevice`` module.

    Raises:
        SystemExit: With code 1 when ``sounddevice`` cannot be imported.
    """
    try:
        import sounddevice as sd
    except ImportError:
        console.print(
            "[red]Error:[/red] Audio playback requires the optional dependency "
            "'sounddevice'."
        )
        # The opening bracket of the "[audio]" extra is backslash-escaped:
        # Rich would otherwise parse "[audio]" as a markup tag and silently
        # drop it, printing a hint that installs the wrong thing.
        console.print(
            "[yellow]Install with:[/yellow]\n"
            " pip install ttsforge\\[audio]\n"
            " pip install sounddevice"
        )
        raise SystemExit(1) from None
    return sd
58
+
59
+
60
@click.group()
def phonemes() -> None:
    """Commands for working with phonemes and pre-tokenized content.

    The phonemes subcommand allows you to:
    - Export EPUB books as pre-tokenized phoneme data (JSON)
    - Export human-readable phoneme representations for review
    - Convert pre-tokenized phoneme files to audiobooks
    - Preview phonemes for given text

    This is useful for:
    - Reviewing and editing pronunciation before generating audio
    - Faster repeated conversions (skip phonemization step)
    - Archiving phoneme data for different vocabulary versions
    """
    # Intentionally empty: this function is only the Click group that the
    # sub-commands below attach to. Its docstring doubles as the --help text,
    # so it is kept verbatim.
76
+
77
+
78
@phonemes.command("export")
@click.argument("epub_file", type=click.Path(exists=True, path_type=Path))
@click.option(
    "-o",
    "--output",
    type=click.Path(path_type=Path),
    help="Output file path. Defaults to input filename with .phonemes.json extension.",
)
@click.option(
    "--readable",
    is_flag=True,
    help="Export as human-readable text format instead of JSON.",
)
@click.option(
    "-l",
    "--language",
    type=click.Choice(list(LANGUAGE_DESCRIPTIONS.keys())),
    default="a",
    help="Language code for phonemization.",
)
@click.option(
    "--chapters",
    type=str,
    help="Chapters to export (e.g., '1-5', '1,3,5', 'all').",
)
@click.option(
    "--vocab-version",
    type=str,
    default="v1.0",
    help="Vocabulary version to use for tokenization.",
)
@click.option(
    "--split-mode",
    "split_mode",
    type=click.Choice(["paragraph", "sentence", "clause"]),
    default="sentence",
    help="Split mode: paragraph (newlines), sentence (spaCy), clause (+ commas).",
)
@click.option(
    "--max-chars",
    type=int,
    default=300,
    help="Maximum characters per segment (for additional splitting of long segments).",
)
def phonemes_export(
    epub_file: Path,
    output: Path | None,
    readable: bool,
    language: str,
    chapters: str | None,
    vocab_version: str,
    split_mode: str,
    max_chars: int,
) -> None:
    """Export an EPUB as pre-tokenized phoneme data.

    This creates a JSON file containing the book's text converted to
    phonemes and tokens, which can be later converted to audio without
    re-running the phonemization step.

    Split modes:
    - paragraph: Split only on double newlines (fewer, longer segments)
    - sentence: Split on sentence boundaries using spaCy (recommended)
    - clause: Split on sentences + commas (more, shorter segments)

    Examples:

        ttsforge phonemes export book.epub

        ttsforge phonemes export book.epub --readable -o book.readable.txt

        ttsforge phonemes export book.epub --language b --chapters 1-5

        ttsforge phonemes export book.epub --split-mode clause
    """
    # NOTE: the docstring above is the user-facing --help text; developer
    # notes live in comments.
    #
    # Flow: read the input file -> resolve the chapter selection -> derive the
    # output path (unless -o was given) -> phonemize each selected chapter into
    # a PhonemeBook -> write it as JSON or as readable text.
    # Every handled failure prints a message and exits with status 1.
    from pykokoro.tokenizer import Tokenizer

    from ..input_reader import InputReader
    from ..phonemes import PhonemeBook

    config = load_config()

    console.print(f"[bold]Loading:[/bold] {epub_file}")

    # Parse file
    try:
        reader = InputReader(epub_file)
        metadata = reader.get_metadata()
        epub_chapters = reader.get_chapters()
    except Exception as e:
        console.print(f"[red]Error loading file:[/red] {e}")
        sys.exit(1)

    if not epub_chapters:
        console.print("[red]Error:[/red] No chapters found in file.")
        sys.exit(1)

    # Chapter selection (None means "no --chapters given": export everything)
    selected_indices: list[int] | None = None
    if chapters:
        try:
            selected_indices = parse_chapter_selection(chapters, len(epub_chapters))
        except ValueError as exc:
            console.print(f"[yellow]{exc}[/yellow]")
            sys.exit(1)

    # Set view of the selection so the per-chapter membership test below is
    # O(1) instead of a list scan.
    wanted = None if selected_indices is None else set(selected_indices)

    # Get effective title and author
    default_title = config.get("default_title", "Untitled")
    effective_title = metadata.title or default_title
    effective_author = metadata.authors[0] if metadata.authors else "Unknown"

    # Compute chapters range for filename and metadata
    chapters_range = format_chapters_range(
        selected_indices or list(range(len(epub_chapters))), len(epub_chapters)
    )

    # Determine output path using template
    if output is None:
        output_template = config.get("phoneme_export_template", "{book_title}")
        output_filename = format_filename_template(
            output_template,
            book_title=effective_title,
            author=effective_author,
            input_stem=epub_file.stem,
            chapters_range=chapters_range,
            default_title=default_title,
        )
        # Append chapters range to filename if partial selection
        if chapters_range:
            output_filename = f"{output_filename}_{chapters_range}"
        suffix = ".readable.txt" if readable else ".phonemes.json"
        output = epub_file.parent / f"{output_filename}{suffix}"

    # Get language code for espeak
    from pykokoro.onnx_backend import LANG_CODE_TO_ONNX

    espeak_lang = LANG_CODE_TO_ONNX.get(language, "en-us")

    # Initialize tokenizer
    console.print(f"[dim]Initializing tokenizer (vocab: {vocab_version})...[/dim]")
    try:
        tokenizer = Tokenizer(vocab_version=vocab_version)
    except Exception as e:
        console.print(f"[red]Error initializing tokenizer:[/red] {e}")
        sys.exit(1)

    # Create PhonemeBook with chapters_range in metadata
    book = PhonemeBook(
        title=effective_title,
        vocab_version=vocab_version,
        lang=espeak_lang,
        metadata={
            "source": str(epub_file),
            "author": effective_author,
            "split_mode": split_mode,
            "chapters_range": chapters_range,
            "total_source_chapters": len(epub_chapters),
        },
    )

    console.print(f"[dim]Split mode: {split_mode}, Max chars: {max_chars}[/dim]")

    # Track warnings for long phonemes
    phoneme_warnings: list[str] = []

    def warn_callback(msg: str) -> None:
        """Collect phoneme length warnings."""
        phoneme_warnings.append(msg)

    # Process chapters
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        console=console,
    ) as progress:
        num_chapters = len(selected_indices) if selected_indices else len(epub_chapters)
        task = progress.add_task("Phonemizing chapters...", total=num_chapters)

        for i, ch in enumerate(epub_chapters):
            if wanted is not None and i not in wanted:
                continue

            chapter = book.create_chapter(ch.title)

            # Remove <<CHAPTER: ...>> markers that epub2text adds
            # at the start of content since we now announce chapter titles
            # separately
            # NOTE(review): with re.MULTILINE, "^" matches at the start of any
            # line, so this strips the first marker even when it is not at the
            # very beginning of the chapter text - confirm that is intended.
            content = re.sub(
                r"^\s*<<CHAPTER:[^>]*>>\s*\n*",
                "",
                ch.text,
                count=1,
                flags=re.MULTILINE,
            )

            # Pass entire chapter text - add_text handles splitting based on split_mode
            if content.strip():
                chapter.add_text(
                    content,
                    tokenizer,
                    lang=espeak_lang,
                    split_mode=split_mode,
                    max_chars=max_chars,
                    warn_callback=warn_callback,
                )

            progress.advance(task)

    # Show warnings if any
    if phoneme_warnings:
        console.print(
            f"\n[yellow]Warning:[/yellow] {len(phoneme_warnings)} segment(s) had "
            f"phonemes exceeding the 510 character limit and were truncated."
        )
        if len(phoneme_warnings) <= 5:
            for w in phoneme_warnings:
                console.print(f" [dim]{w}[/dim]")
        else:
            for w in phoneme_warnings[:3]:
                console.print(f" [dim]{w}[/dim]")
            console.print(f" [dim]... and {len(phoneme_warnings) - 3} more[/dim]")

    # Save output. A failed write (missing directory, permissions, full disk)
    # is reported like every other failure in this command instead of
    # surfacing as a raw traceback.
    try:
        if readable:
            book.save_readable(output)
        else:
            book.save(output)
    except OSError as e:
        console.print(f"[red]Error writing output:[/red] {e}")
        sys.exit(1)

    console.print(f"[green]Exported to:[/green] {output}")
    console.print(
        f"[dim]Chapters: {len(book.chapters)}, "
        f"Segments: {book.total_segments}, "
        f"Tokens: {book.total_tokens:,}[/dim]"
    )
311
+
312
+
313
@phonemes.command("convert")
@click.argument("phoneme_file", type=click.Path(exists=True, path_type=Path))
@click.option(
    "-o",
    "--output",
    type=click.Path(path_type=Path),
    help="Output file path. Defaults to input filename with audio extension.",
)
@click.option(
    "-f",
    "--format",
    "output_format",
    type=click.Choice(SUPPORTED_OUTPUT_FORMATS),
    help="Output audio format.",
)
@click.option("-v", "--voice", type=click.Choice(VOICES), help="Voice to use for TTS.")
@click.option("-s", "--speed", type=float, default=1.0, help="Speech speed.")
@click.option(
    "--gpu/--no-gpu",
    "use_gpu",
    default=None,
    help="Enable/disable GPU acceleration.",
)
@click.option(
    "--silence",
    type=float,
    default=2.0,
    help="Silence between chapters in seconds.",
)
@click.option(
    "--pause-clause",
    type=float,
    default=None,
    help="Pause after clauses in seconds (default: 0.25).",
)
@click.option(
    "--pause-sentence",
    type=float,
    default=None,
    help="Pause after sentences in seconds (default: 0.2).",
)
@click.option(
    "--pause-paragraph",
    type=float,
    default=None,
    help="Pause after paragraphs in seconds (default: 0.75).",
)
@click.option(
    "--pause-variance",
    type=float,
    default=None,
    help="Random variance added to pauses in seconds (default: 0.05).",
)
@click.option(
    "--pause-mode",
    type=str,
    default=None,
    help="auto, manual or tts (default: auto).",
)
@click.option(
    "--announce-chapters/--no-announce-chapters",
    "announce_chapters",
    default=None,
    help="Read chapter titles aloud before chapter content (default: enabled).",
)
@click.option(
    "--chapter-pause",
    type=float,
    default=None,
    help="Pause duration after chapter title announcement in seconds (default: 2.0).",
)
@click.option(
    "--chapters",
    type=str,
    default=None,
    help="Select chapters to convert (1-based). E.g., '1-5', '3,5,7', or '1-3,7'.",
)
@click.option(
    "--title",
    type=str,
    default=None,
    help="Audiobook title (for m4b metadata).",
)
@click.option(
    "--author",
    type=str,
    default=None,
    help="Audiobook author (for m4b metadata).",
)
@click.option(
    "--cover",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Cover image path (for m4b format).",
)
@click.option(
    "--voice-blend",
    type=str,
    default=None,
    help="Blend multiple voices. E.g., 'af_nicole:50,am_michael:50'.",
)
@click.option(
    "--voice-database",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to custom voice database (SQLite).",
)
@click.option(
    "--streaming/--no-streaming",
    "streaming",
    default=False,
    help="Use streaming mode (faster, no resume). Default: resumable.",
)
@click.option(
    "--keep-chapters",
    is_flag=True,
    help="Keep intermediate chapter files after merging.",
)
@click.option(
    "-y",
    "--yes",
    is_flag=True,
    help="Skip confirmation prompts.",
)
@click.pass_context
def phonemes_convert(
    ctx: click.Context,
    phoneme_file: Path,
    output: Path | None,
    output_format: str | None,
    voice: str | None,
    speed: float,
    use_gpu: bool | None,
    silence: float,
    pause_clause: float | None,
    pause_sentence: float | None,
    pause_paragraph: float | None,
    pause_variance: float | None,
    pause_mode: str | None,
    announce_chapters: bool | None,
    chapter_pause: float | None,
    chapters: str | None,
    title: str | None,
    author: str | None,
    cover: Path | None,
    voice_blend: str | None,
    voice_database: Path | None,
    streaming: bool,
    keep_chapters: bool,
    yes: bool,
) -> None:
    """Convert a pre-tokenized phoneme file to audio.

    PHONEME_FILE should be a JSON file created by 'ttsforge phonemes export'.

    By default, conversion is resumable (chapter-at-a-time mode). If interrupted,
    re-running the same command will resume from the last completed chapter.

    Use --streaming for faster conversion without resume capability.

    Examples:

        ttsforge phonemes convert book.phonemes.json

        ttsforge phonemes convert book.phonemes.json -v am_adam -o book.m4b

        ttsforge phonemes convert book.phonemes.json --chapters 1-5

        ttsforge phonemes convert book.phonemes.json --streaming
    """
    # NOTE: the docstring above is the user-facing --help text; developer
    # notes live in comments.
    #
    # Flow: load the phoneme book -> merge CLI options with config defaults ->
    # print a summary and (unless --yes) ask for confirmation -> run the
    # converter under a Rich progress bar -> report the result.
    # Options whose CLI default is None fall back to the loaded config, then to
    # the literal default written at the point of use.
    #
    # NOTE(review): this local import shadows the module-level
    # parse_chapter_selection imported from ..chapter_selection - confirm both
    # names refer to the same implementation.
    from ..phoneme_conversion import (
        PhonemeConversionOptions,
        PhonemeConversionProgress,
        PhonemeConverter,
        parse_chapter_selection,
    )
    from ..phonemes import PhonemeBook

    console.print(f"[bold]Loading:[/bold] {phoneme_file}")

    try:
        book = PhonemeBook.load(phoneme_file)
    except Exception as e:
        console.print(f"[red]Error loading phoneme file:[/red] {e}")
        sys.exit(1)

    # Load config for defaults
    config = load_config()
    model_path = ctx.obj.get("model_path") if ctx.obj else None
    voices_path = ctx.obj.get("voices_path") if ctx.obj else None

    # Get book info and metadata
    book_info = book.get_info()
    book_metadata = book_info.get("metadata", {})
    default_title = config.get("default_title", "Untitled")

    # Use CLI title/author if provided, otherwise use book metadata
    effective_title = (
        title if title is not None else book_info.get("title", default_title)
    )
    effective_author = (
        author if author is not None else book_metadata.get("author", "Unknown")
    )

    # Validate chapter selection if provided (empty list == all chapters)
    selected_indices: list[int] = []
    if chapters:
        try:
            selected_indices = parse_chapter_selection(chapters, len(book.chapters))
        except ValueError as e:
            console.print(f"[red]Invalid chapter selection:[/red] {e}")
            sys.exit(1)

    # Compute chapters range for filename
    # Use metadata chapters_range if converting all chapters from a partial export
    stored_chapters_range = book_metadata.get("chapters_range", "")
    if selected_indices:
        # New selection on top of potentially partial export
        chapters_range = format_chapters_range(selected_indices, len(book.chapters))
    else:
        # Use stored range if available
        chapters_range = stored_chapters_range

    # Determine output format and path
    fmt = output_format or config.get("default_format", "m4b")
    if output is None:
        output_template = config.get("output_filename_template", "{book_title}")
        output_filename = format_filename_template(
            output_template,
            book_title=effective_title,
            author=effective_author,
            input_stem=phoneme_file.stem,
            chapters_range=chapters_range,
            default_title=default_title,
        )
        # Append chapters range to filename if partial selection
        if chapters_range:
            output_filename = f"{output_filename}_{chapters_range}"
        output = phoneme_file.parent / f"{output_filename}.{fmt}"

    # Get voice
    if voice is None:
        voice = config.get("default_voice", "af_heart")

    # Get GPU setting
    gpu = use_gpu if use_gpu is not None else config.get("use_gpu", False)

    # Calculate total segments for selected chapters
    if selected_indices:
        selected_chapter_count = len(selected_indices)
        total_segments = sum(len(book.chapters[i].segments) for i in selected_indices)
    else:
        selected_chapter_count = len(book.chapters)
        total_segments = book.total_segments

    # Show info
    console.print(f"[dim]Title: {effective_title}[/dim]")
    if selected_indices:
        ch_count = f"{selected_chapter_count}/{book_info['chapters']}"
        console.print(
            f"[dim]Chapters: {ch_count} (selected), Segments: {total_segments}[/dim]"
        )
    else:
        console.print(
            f"[dim]Chapters: {book_info['chapters']}, "
            f"Segments: {book_info['segments']}, "
            f"Tokens: {book_info['tokens']:,}[/dim]"
        )

    if voice_blend:
        console.print(f"[dim]Voice blend: {voice_blend}[/dim]")
    else:
        console.print(f"[dim]Voice: {voice}, Speed: {speed}x[/dim]")

    console.print(f"[dim]Output: {output} (format: {fmt})[/dim]")
    mode_str = "streaming" if streaming else "resumable (chapter-at-a-time)"
    console.print(f"[dim]Mode: {mode_str}[/dim]")

    if not yes:
        if not Confirm.ask("Proceed with conversion?"):
            console.print("[yellow]Cancelled.[/yellow]")
            return

    # Create conversion options
    options = PhonemeConversionOptions(
        # ``voice`` was already resolved against the config default above.
        voice=voice,
        speed=speed,
        output_format=fmt,
        use_gpu=gpu,
        silence_between_chapters=silence,
        pause_clause=(
            pause_clause
            if pause_clause is not None
            else config.get("pause_clause", 0.25)
        ),
        pause_sentence=(
            pause_sentence
            if pause_sentence is not None
            else config.get("pause_sentence", 0.2)
        ),
        pause_paragraph=(
            pause_paragraph
            if pause_paragraph is not None
            else config.get("pause_paragraph", 0.75)
        ),
        pause_variance=(
            pause_variance
            if pause_variance is not None
            else config.get("pause_variance", 0.05)
        ),
        pause_mode=(
            pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
        ),
        announce_chapters=(
            announce_chapters
            if announce_chapters is not None
            else config.get("announce_chapters", True)
        ),
        chapter_pause_after_title=(
            chapter_pause
            if chapter_pause is not None
            else config.get("chapter_pause_after_title", 2.0)
        ),
        title=effective_title,
        author=effective_author,
        cover_image=cover,
        voice_blend=voice_blend,
        voice_database=voice_database,
        chapters=chapters,
        resume=not streaming,  # Resume only in chapter-at-a-time mode
        keep_chapter_files=keep_chapters,
        chapter_filename_template=config.get(
            "chapter_filename_template",
            "{chapter_num:03d}_{book_title}_{chapter_title}",
        ),
        model_path=model_path,
        voices_path=voices_path,
    )

    # Progress tracking with Rich. Both names are bound for real once the
    # Progress context below is entered; the callbacks only read them.
    progress_bar: Progress | None = None
    task_id: TaskID | None = None

    def log_callback(message: str, level: str) -> None:
        """Handle log messages."""
        if level == "warning":
            console.print(f"[yellow]{message}[/yellow]")
        elif level == "error":
            console.print(f"[red]{message}[/red]")
        else:
            console.print(f"[dim]{message}[/dim]")

    def progress_callback(prog: PhonemeConversionProgress) -> None:
        """Update progress display."""
        if progress_bar is not None and task_id is not None:
            ch_progress = f"Ch {prog.current_chapter}/{prog.total_chapters}"
            progress_bar.update(
                task_id,
                completed=prog.segments_processed,
                description=f"[cyan]{ch_progress}[/cyan]",
            )

    # Create converter
    converter = PhonemeConverter(
        book=book,
        options=options,
        progress_callback=progress_callback,
        log_callback=log_callback,
    )

    # Run conversion with progress bar
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TextColumn("[dim]{task.fields[segment_info]}[/dim]"),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console,
        transient=False,
    ) as progress:
        progress_bar = progress
        task_id = progress.add_task(
            "[cyan]Converting...[/cyan]",
            total=total_segments,
            segment_info="",
        )

        # Choose conversion mode
        if streaming:
            result = converter.convert_streaming(output)
        else:
            result = converter.convert(output)

        # Mark complete - but only for a successful run, so a failed
        # conversion does not end with a misleading 100% bar.
        if result.success:
            progress.update(task_id, completed=total_segments)

    # Show result
    if result.success:
        console.print("\n[green]Conversion complete![/green]")
        console.print(f"[bold]Output:[/bold] {result.output_path}")
        if result.duration > 0:
            from ..utils import format_duration

            console.print(f"[dim]Duration: {format_duration(result.duration)}[/dim]")
    else:
        console.print(f"\n[red]Conversion failed:[/red] {result.error_message}")
        sys.exit(1)
723
+
724
+
725
@phonemes.command("preview")
@click.argument("text")
@click.option(
    "-l",
    "--language",
    type=str,
    default="a",
    help="Language code for phonemization (e.g., 'de', 'en-us', 'a' for auto).",
)
@click.option(
    "--tokens",
    is_flag=True,
    help="Show token IDs in addition to phonemes.",
)
@click.option(
    "--vocab-version",
    type=str,
    default="v1.0",
    help="Vocabulary version to use.",
)
@click.option(
    "-p",
    "--play",
    is_flag=True,
    help="Play audio preview of the text.",
)
@click.option(
    "-v",
    "--voice",
    type=str,
    default="af_sky",
    help=(
        "Voice to use for audio preview, or voice blend "
        "(e.g., 'af_nicole:50,am_michael:50')."
    ),
)
@click.option(
    "--phoneme-dict",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to custom phoneme dictionary file.",
)
def phonemes_preview(
    text: str,
    language: str,
    tokens: bool,
    vocab_version: str,
    play: bool,
    voice: str,
    phoneme_dict: Path | None,
) -> None:
    """Preview phonemes for given text.

    Shows how text will be converted to phonemes and optionally tokens.
    Use --play to hear the audio output.

    Examples:

        ttsforge phonemes preview "Hello world"

        ttsforge phonemes preview "Hello world" --tokens

        ttsforge phonemes preview "Hello world" --language de

        ttsforge phonemes preview "König" --language de --play

        ttsforge phonemes preview "Hermione" --play --phoneme-dict custom.json

        ttsforge phonemes preview "Hello" --play --voice "af_nicole:50,am_michael:50"
    """
    # NOTE: the docstring above is the user-facing --help text; developer
    # notes live in comments.
    #
    # Flow: phonemize TEXT and print the result (plus token IDs with
    # --tokens); with --play, synthesise the text to a temporary WAV file,
    # play it, and delete the file.
    # Exits with status 1 when the tokenizer cannot be created, the optional
    # playback dependency is missing, or audio generation/playback fails.
    from pykokoro.onnx_backend import LANG_CODE_TO_ONNX
    from pykokoro.tokenizer import Tokenizer
    from rich.markup import escape

    # Map language code - support both short codes and ISO codes. Anything not
    # in the mapping is passed through unchanged (assumed to be an ISO code
    # like 'de' or 'en-us').
    espeak_lang = LANG_CODE_TO_ONNX.get(language, language)

    try:
        tokenizer = Tokenizer(vocab_version=vocab_version)
    except Exception as e:
        console.print(f"[red]Error initializing tokenizer:[/red] {e}")
        sys.exit(1)

    phonemes = tokenizer.phonemize(text, lang=espeak_lang)
    readable = tokenizer.format_readable(text, lang=espeak_lang)

    # User text and phonemizer output are data, not markup: escape them so
    # bracketed fragments are shown literally instead of being swallowed as
    # Rich tags (or raising MarkupError on something like "[/b]").
    console.print(f"[bold]Text:[/bold] {escape(text)}")
    lang_desc = LANGUAGE_DESCRIPTIONS.get(language, language)
    console.print(f"[bold]Language:[/bold] {lang_desc} ({espeak_lang})")
    console.print(f"[bold]Phonemes:[/bold] {escape(str(phonemes))}")
    console.print(f"[bold]Readable:[/bold] {escape(str(readable))}")

    if tokens:
        token_ids = tokenizer.tokenize(phonemes)
        console.print(f"[bold]Tokens:[/bold] {token_ids}")
        console.print(f"[dim]Token count: {len(token_ids)}[/dim]")

    if not play:
        return

    # Audio preview
    import tempfile

    from ..conversion import ConversionOptions, TTSConverter

    # Fail fast: verify the playback dependency before spending time on
    # synthesis (exits with status 1 if it is missing).
    sd = _require_sounddevice()

    console.print("\n[bold]Generating audio preview...[/bold]")

    try:
        import soundfile as sf

        # Auto-detect if voice is a blend
        parsed_voice, parsed_voice_blend = parse_voice_parameter(voice)

        # Initialize converter
        options = ConversionOptions(
            phoneme_dictionary_path=str(phoneme_dict) if phoneme_dict else None,
            voice=parsed_voice or "af_sky",  # Fallback to default if blend
            voice_blend=parsed_voice_blend,
            language=language,
            output_format="wav",  # Explicitly set WAV format
        )
        converter = TTSConverter(options)

        # Reserve a temp path; the handle is closed right away so the
        # converter can write to the path itself.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            temp_output = Path(tmp.name)

        try:
            # Generate audio
            result = converter.convert_text(text, temp_output)

            if not result.success:
                console.print(f"[red]Error:[/red] {result.error_message}")
                # A failed preview is a failure: signal it in the exit status.
                # SystemExit is not an Exception, so the handler below does
                # not intercept it, and the finally still cleans up.
                sys.exit(1)

            # Play the audio
            audio_data, sample_rate = sf.read(str(temp_output))
            console.print("[dim]▶ Playing...[/dim]")
            sd.play(audio_data, sample_rate)
            sd.wait()
            console.print("[green]✓ Playback complete[/green]")
        finally:
            # Cleanup temp file
            temp_output.unlink(missing_ok=True)

    except Exception as e:
        import traceback

        # Error text and tracebacks can contain "[...]" (e.g. "x[i]" in a
        # source line), which must not be interpreted as markup.
        console.print(f"[red]Error playing audio:[/red] {escape(str(e))}")
        console.print(f"[dim]{escape(traceback.format_exc())}[/dim]")
        sys.exit(1)
879
+
880
+
881
@phonemes.command("info")
@click.argument("phoneme_file", type=click.Path(exists=True, path_type=Path))
@click.option(
    "--stats",
    is_flag=True,
    help="Show detailed token statistics.",
)
def phonemes_info(phoneme_file: Path, stats: bool) -> None:
    """Show information about a phoneme file.

    PHONEME_FILE should be a JSON file created by 'ttsforge phonemes export'.

    Use --stats to show detailed token statistics (min, median, mean, max).
    """
    # NOTE: the docstring above is the user-facing --help text; developer
    # notes live in comments.
    #
    # Prints three things: a property table for the book, optionally (--stats)
    # per-segment statistics with a token histogram, and a per-chapter table.
    from ..phonemes import PhonemeBook

    try:
        book = PhonemeBook.load(phoneme_file)
    except Exception as e:
        console.print(f"[red]Error loading phoneme file:[/red] {e}")
        sys.exit(1)

    info = book.get_info()

    # --- Summary table -----------------------------------------------------
    summary = Table(title=f"Phoneme File: {phoneme_file.name}")
    summary.add_column("Property", style="bold")
    summary.add_column("Value")

    summary_rows = (
        ("Title", info["title"]),
        ("Vocabulary", info["vocab_version"]),
        ("Language", info["lang"]),
        ("Chapters", str(info["chapters"])),
        ("Segments", str(info["segments"])),
        ("Tokens", f"{info['tokens']:,}"),
        ("Phonemes", f"{info['phonemes']:,}"),
    )
    for prop, value in summary_rows:
        summary.add_row(prop, value)

    for key, value in (info.get("metadata") or {}).items():
        summary.add_row(f"Meta: {key}", str(value))

    console.print(summary)

    # --- Per-segment measurements -------------------------------------------
    all_segments = [seg for _, seg in book.iter_segments()]
    token_counts = [len(seg.tokens) for seg in all_segments]
    char_counts = [len(seg.text) for seg in all_segments]
    phoneme_counts = [len(seg.phonemes) for seg in all_segments]

    if stats and token_counts:
        import statistics

        console.print("\n[bold]Segment Statistics:[/bold]")
        stats_table = Table(show_header=True)
        stats_table.add_column("Metric", style="bold")
        for heading in ("Tokens", "Characters", "Phonemes"):
            stats_table.add_column(heading, justify="right")

        series = (token_counts, char_counts, phoneme_counts)

        # Each row applies one reducer to all three series.
        metric_rows = [
            ("Count", lambda xs: str(len(xs))),
            ("Min", lambda xs: str(min(xs))),
            ("Max", lambda xs: str(max(xs))),
            ("Mean", lambda xs: f"{statistics.mean(xs):.1f}"),
            ("Median", lambda xs: f"{statistics.median(xs):.1f}"),
        ]
        # Sample standard deviation needs at least two data points.
        if len(token_counts) > 1:
            metric_rows.append(("Std Dev", lambda xs: f"{statistics.stdev(xs):.1f}"))

        for metric, render in metric_rows:
            stats_table.add_row(metric, *(render(xs) for xs in series))

        console.print(stats_table)

        # Token distribution histogram (simple text-based)
        console.print("\n[bold]Token Distribution:[/bold]")
        bucket_labels = [
            "0-49",
            "50-99",
            "100-149",
            "150-199",
            "200-249",
            "250-299",
            "300-349",
            "350-399",
            "400-449",
            "450-499",
            "500+",
        ]
        bucket_counts = [0] * len(bucket_labels)
        overflow_bucket = len(bucket_labels) - 1

        # Buckets are 50 wide; everything from 500 up lands in the last one.
        for n_tokens in token_counts:
            bucket_counts[min(n_tokens // 50, overflow_bucket)] += 1

        max_count = max(bucket_counts) if bucket_counts else 1
        bar_width = 30

        for idx, (label, count) in enumerate(zip(bucket_labels, bucket_counts)):
            # Always show the first three buckets, even when empty.
            if count <= 0 and idx >= 3:
                continue
            bar_len = int((count / max_count) * bar_width) if max_count > 0 else 0
            bar = "█" * bar_len
            console.print(f" {label:>8} │ {bar:<{bar_width}} {count:>4}")

    # --- Chapter table ------------------------------------------------------
    console.print("\n[bold]Chapters:[/bold]")
    chapter_table = Table(show_header=True)
    chapter_table.add_column("#", style="dim", width=4)
    chapter_table.add_column("Title")
    chapter_table.add_column("Segments", justify="right")
    chapter_table.add_column("Tokens", justify="right")

    for number, chapter in enumerate(book.chapters, start=1):
        chapter_table.add_row(
            str(number),
            chapter.title[:50],  # titles are cut to 50 characters
            str(len(chapter.segments)),
            f"{chapter.total_tokens:,}",
        )

    console.print(chapter_table)