ttsforge 0.1.0__py3-none-any.whl

@@ -0,0 +1,1389 @@
1
+ """Utility commands for ttsforge CLI."""
2
+
3
+ import re
4
+ import shutil
5
+ import sys
6
+ import tempfile
7
+ from pathlib import Path
8
+ from typing import Any, Literal, TypeAlias, cast
9
+
10
+ import click
11
+ import numpy as np
12
+ from pykokoro import GenerationConfig, KokoroPipeline, PipelineConfig
13
+ from pykokoro.onnx_backend import (
14
+ DEFAULT_MODEL_QUALITY,
15
+ DEFAULT_MODEL_SOURCE,
16
+ DEFAULT_MODEL_VARIANT,
17
+ GITHUB_VOICES_FILENAME_V1_0,
18
+ GITHUB_VOICES_FILENAME_V1_1_DE,
19
+ GITHUB_VOICES_FILENAME_V1_1_ZH,
20
+ LANG_CODE_TO_ONNX,
21
+ MODEL_QUALITY_FILES,
22
+ VOICE_NAMES_BY_VARIANT,
23
+ Kokoro,
24
+ ModelQuality,
25
+ VoiceBlend,
26
+ download_all_voices,
27
+ download_config,
28
+ download_model,
29
+ download_model_github,
30
+ download_voices_github,
31
+ get_config_path,
32
+ get_model_dir,
33
+ get_model_path,
34
+ get_voices_dir,
35
+ is_config_downloaded,
36
+ )
37
+ from pykokoro.onnx_backend import VOICE_NAMES_V1_0 as VOICE_NAMES
38
+ from pykokoro.stages.audio_generation.onnx import OnnxAudioGenerationAdapter
39
+ from pykokoro.stages.audio_postprocessing.onnx import OnnxAudioPostprocessingAdapter
40
+ from pykokoro.stages.phoneme_processing.onnx import OnnxPhonemeProcessorAdapter
41
+ from rich.progress import (
42
+ BarColumn,
43
+ Progress,
44
+ SpinnerColumn,
45
+ TaskProgressColumn,
46
+ TextColumn,
47
+ TimeElapsedColumn,
48
+ )
49
+ from rich.table import Table
50
+
51
+ from ..chapter_selection import parse_chapter_selection
52
+ from ..constants import (
53
+ DEFAULT_CONFIG,
54
+ DEFAULT_VOICE_FOR_LANG,
55
+ LANGUAGE_DESCRIPTIONS,
56
+ VOICE_PREFIX_TO_LANG,
57
+ VOICES,
58
+ )
59
+ from ..utils import format_size, load_config, reset_config, save_config
60
+ from .helpers import DEMO_TEXT, VOICE_BLEND_PRESETS, console, parse_voice_parameter
61
+
62
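+ # Allowed values for the model_source / model_variant settings, as resolved by
+ # _resolve_model_source_and_variant below.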
+ ModelSource: TypeAlias = Literal["huggingface", "github"]
63
+ ModelVariant: TypeAlias = Literal["v1.0", "v1.1-zh", "v1.1-de"]
64
+
65
+
66
+ def _require_sounddevice() -> Any:
67
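+ """Import and return the sounddevice module, exiting with install hints if it is missing."""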
+ try:
68
+ import sounddevice as sd
69
+ except ImportError:
70
+ console.print(
71
+ "[red]Error:[/red] Audio playback requires the optional dependency "
72
+ "'sounddevice'."
73
+ )
74
+ console.print(
75
+ "[yellow]Install with:[/yellow]\n"
76
+ " pip install ttsforge[audio]\n"
77
+ " pip install sounddevice"
78
+ )
79
+ raise SystemExit(1) from None
80
+ return sd
81
+
82
+
83
+ @click.command()
84
+ @click.option(
85
+ "-l",
86
+ "--language",
87
+ type=click.Choice(list(LANGUAGE_DESCRIPTIONS.keys())),
88
+ default=None,
89
+ help="Filter voices by language (default: all languages).",
90
+ )
91
+ def voices(language: str | None) -> None:
92
+ """List available TTS voices."""
93
+ table = Table(title="Available Voices")
94
+ table.add_column("Voice", style="bold")
95
+ table.add_column("Language")
96
+ table.add_column("Gender")
97
+ table.add_column("Default", style="dim")
98
+
99
+ for voice in VOICES:
100
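+ # Voice ids encode language and gender in a two-letter prefix (e.g. "af_heart" -> American English, female).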
+ prefix = voice[:2]
101
+ lang_code = VOICE_PREFIX_TO_LANG.get(prefix, "?")
102
+
103
+ if language and lang_code != language:
104
+ continue
105
+
106
+ lang_name = LANGUAGE_DESCRIPTIONS.get(lang_code, "Unknown")
107
+ gender = "Female" if prefix[1] == "f" else "Male"
108
+ is_default = "Yes" if DEFAULT_VOICE_FOR_LANG.get(lang_code) == voice else ""
109
+
110
+ table.add_row(voice, lang_name, gender, is_default)
111
+
112
+ console.print(table)
113
+
114
+
115
+ @click.command()
116
+ @click.option(
117
+ "-o",
118
+ "--output",
119
+ type=click.Path(path_type=Path),
120
+ default=None,
121
+ help="Output file path (default: ./voices_demo.wav).",
122
+ )
123
+ @click.option(
124
+ "-l",
125
+ "--language",
126
+ type=click.Choice(list(LANGUAGE_DESCRIPTIONS.keys())),
127
+ default=None,
128
+ help="Filter voices by language (default: all languages).",
129
+ )
130
+ @click.option(
131
+ "-v",
132
+ "--voice",
133
+ "voices_filter",
134
+ type=str,
135
+ default=None,
136
+ help="Specific voices to include (comma-separated, e.g., 'af_heart,am_adam').",
137
+ )
138
+ @click.option(
139
+ "-s",
140
+ "--speed",
141
+ type=float,
142
+ default=1.0,
143
+ help="Speech speed (default: 1.0).",
144
+ )
145
+ @click.option(
146
+ "--gpu/--no-gpu",
147
+ "use_gpu",
148
+ default=None,
149
+ help="Enable/disable GPU acceleration.",
150
+ )
151
+ @click.option(
152
+ "--silence",
153
+ type=float,
154
+ default=0.5,
155
+ help="Silence between voice samples in seconds (default: 0.5).",
156
+ )
157
+ @click.option(
158
+ "--text",
159
+ type=str,
160
+ default=None,
161
+ help="Custom text to use (use {voice} placeholder for voice name).",
162
+ )
163
+ @click.option(
164
+ "--separate",
165
+ is_flag=True,
166
+ help="Save each voice as a separate file instead of concatenating.",
167
+ )
168
+ @click.option(
169
+ "--blend",
170
+ type=str,
171
+ default=None,
172
+ help="Voice blend to demo (e.g., 'af_nicole:50,am_michael:50').",
173
+ )
174
+ @click.option(
175
+ "--blend-presets",
176
+ is_flag=True,
177
+ help="Demo a curated set of voice blend combinations.",
178
+ )
179
+ @click.option(
180
+ "-p",
181
+ "--play",
182
+ "play_audio",
183
+ is_flag=True,
184
+ help="Play audio directly (also saves to file if -o specified).",
185
+ )
186
+ @click.pass_context
187
+ def demo( # noqa: C901
188
+ ctx: click.Context,
189
+ output: Path | None,
190
+ language: str | None,
191
+ voices_filter: str | None,
192
+ speed: float,
193
+ use_gpu: bool | None,
194
+ silence: float,
195
+ text: str | None,
196
+ separate: bool,
197
+ blend: str | None,
198
+ blend_presets: bool,
199
+ play_audio: bool,
200
+ ) -> None:
201
+ """Generate a demo audio file with all available voices.
202
+
203
+ Creates a single audio file with samples from each voice, or separate files
204
+ for each voice with --separate. Great for previewing and comparing voices.
205
+
206
+ Supports voice blending with --blend or --blend-presets options.
207
+
208
+ Examples:
209
+
210
+ ttsforge demo
211
+
212
+ ttsforge demo -l a # Only American English voices
213
+
214
+ ttsforge demo -v af_heart,am_adam # Specific voices
215
+
216
+ ttsforge demo --separate -o ./voices/ # Separate files in directory
217
+
218
+ ttsforge demo --text "Custom message from {voice}!"
219
+
220
+ ttsforge demo --blend "af_nicole:50,am_michael:50" # Custom voice blend
221
+
222
+ ttsforge demo --blend-presets # Demo all preset voice blends
223
+
224
+ ttsforge demo --play # Play directly without saving
225
+
226
+ ttsforge demo -v af_heart --play # Play a single voice demo
227
+ """
228
+ config = load_config()
229
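+ # The CLI --gpu/--no-gpu flag takes precedence over the configured use_gpu setting.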
+ gpu = use_gpu if use_gpu is not None else config.get("use_gpu", False)
230
+ model_path = ctx.obj.get("model_path") if ctx.obj else None
231
+ voices_path = ctx.obj.get("voices_path") if ctx.obj else None
232
+
233
+ # Playback is not compatible with --separate or --blend-presets (multiple files)
234
+ if play_audio and separate:
235
+ console.print(
236
+ "[red]Error:[/red] --play is not compatible with --separate. "
237
+ "Use --play without --separate to play a combined demo."
238
+ )
239
+ sys.exit(1)
240
+ if play_audio and blend_presets:
241
+ console.print(
242
+ "[red]Error:[/red] --play is not compatible with --blend-presets. "
243
+ "Use --play with a single --blend instead."
244
+ )
245
+ sys.exit(1)
246
+
247
+ # Helper function to create filename from blend string
248
+ def blend_to_filename(blend_str: str) -> str:
249
+ """Convert blend string to filename-safe format."""
250
+ # e.g., "af_nicole:50,am_michael:50" -> "blend_af_nicole_50_am_michael_50"
251
+ parts = []
252
+ for part in blend_str.split(","):
253
+ part = part.strip()
254
+ if ":" in part:
255
+ voice_name, weight = part.split(":", 1)
256
+ parts.append(f"{voice_name.strip()}_{weight.strip()}")
257
+ else:
258
+ parts.append(part.strip())
259
+ return "blend_" + "_".join(parts)
260
+
261
+ # Handle blend modes (--blend or --blend-presets)
262
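+ # A blend is a comma-separated list of "voice:weight" pairs, e.g. "af_nicole:50,am_michael:50";
+ # parsing is handled by VoiceBlend.parse below.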
+ if blend or blend_presets:
263
+ # Collect blends to process
264
+ blends_to_process: list[tuple[str, str]] = [] # (blend_string, description)
265
+
266
+ if blend:
267
+ # Custom blend specified
268
+ blends_to_process.append((blend, f"Custom blend: {blend}"))
269
+
270
+ if blend_presets:
271
+ # Add all preset blends
272
+ blends_to_process.extend(VOICE_BLEND_PRESETS)
273
+
274
+ # For playback with single blend, we don't need an output directory
275
+ save_output = output is not None or not play_audio
276
+
277
+ if save_output:
278
+ # Determine output directory
279
+ if output is None:
280
+ output = Path("./voice_blends")
281
+ output.mkdir(parents=True, exist_ok=True)
282
+ console.print(f"[bold]Output directory:[/bold] {output}")
283
+
284
+ console.print(f"[dim]Voice blends: {len(blends_to_process)}[/dim]")
285
+ console.print(f"[dim]Speed: {speed}x[/dim]")
286
+ console.print(f"[dim]GPU: {'enabled' if gpu else 'disabled'}[/dim]")
287
+
288
+ # Initialize TTS pipeline
289
+ try:
290
+ kokoro = Kokoro(
291
+ model_path=model_path,
292
+ voices_path=voices_path,
293
+ use_gpu=gpu,
294
+ )
295
+ generation = GenerationConfig(speed=speed, lang="en-us")
296
+ pipeline_config = PipelineConfig(
297
+ voice=DEFAULT_CONFIG.get("default_voice", "af_heart"),
298
+ generation=generation,
299
+ model_path=model_path,
300
+ voices_path=voices_path,
301
+ )
302
+ pipeline = KokoroPipeline(
303
+ pipeline_config,
304
+ phoneme_processing=OnnxPhonemeProcessorAdapter(kokoro),
305
+ audio_generation=OnnxAudioGenerationAdapter(kokoro),
306
+ audio_postprocessing=OnnxAudioPostprocessingAdapter(kokoro),
307
+ )
308
+ except Exception as e:
309
+ console.print(f"[red]Error initializing TTS engine:[/red] {e}")
310
+ sys.exit(1)
311
+
312
+ sample_rate = 24000
313
+
314
+ with Progress(
315
+ SpinnerColumn(),
316
+ TextColumn("[progress.description]{task.description}"),
317
+ BarColumn(),
318
+ TaskProgressColumn(),
319
+ TimeElapsedColumn(),
320
+ console=console,
321
+ ) as progress:
322
+ task = progress.add_task(
323
+ "Generating voice blend demos...", total=len(blends_to_process)
324
+ )
325
+
326
+ for blend_str, description in blends_to_process:
327
+ try:
328
+ # Parse the blend
329
+ voice_blend = VoiceBlend.parse(blend_str)
330
+
331
+ # Create demo text describing the blend
332
+ voice_names = [v for v, _ in voice_blend.voices]
333
+ if text:
334
+ demo_text = text.format(voice=" and ".join(voice_names))
335
+ else:
336
+ voices_str = " and ".join(voice_names)
337
+ demo_text = (
338
+ f"This is a blend of {voices_str} speaking together."
339
+ )
340
+
341
+ # Generate audio with blended voice
342
+ blend_lang = VOICE_PREFIX_TO_LANG.get(voice_names[0][:2], "a")
343
+ onnx_lang = LANG_CODE_TO_ONNX.get(blend_lang, "en-us")
344
+ result = pipeline.run(demo_text, voice=voice_blend, lang=onnx_lang)
345
+ samples = result.audio
346
+ sr = result.sample_rate
347
+
348
+ # Handle playback
349
+ if play_audio:
350
+ sd = _require_sounddevice()
351
+
352
+ progress.console.print(f" [dim]Playing {description}...[/dim]")
353
+ sd.play(samples, sr)
354
+ sd.wait()
355
+ progress.console.print(
356
+ f" [green]{description}[/green]: Playback complete"
357
+ )
358
+
359
+ # Save to file if output specified
360
+ if save_output and output is not None:
361
+ import soundfile as sf
362
+
363
+ filename = blend_to_filename(blend_str) + ".wav"
364
+ voice_file = output / filename
365
+ sf.write(str(voice_file), samples, sr)
366
+ if not play_audio:
367
+ progress.console.print(
368
+ f" [green]{description}[/green]: {voice_file}"
369
+ )
370
+
371
+ except Exception as e:
372
+ console.print(f" [red]{blend_str}[/red]: Failed - {e}")
373
+
374
+ progress.advance(task)
375
+
376
+ if save_output:
377
+ num_saved = len(blends_to_process)
378
+ console.print(
379
+ f"\n[green]Saved {num_saved} voice blend demos to:[/green] {output}"
380
+ )
381
+ elif play_audio:
382
+ console.print("\n[green]Playback complete.[/green]")
383
+ return
384
+
385
+ # Regular voice demo mode (no blending)
386
+ # Determine which voices to use
387
+ selected_voices: list[str] = []
388
+
389
+ if voices_filter:
390
+ # Specific voices requested
391
+ for v in voices_filter.split(","):
392
+ v = v.strip()
393
+ if v in VOICES:
394
+ selected_voices.append(v)
395
+ else:
396
+ console.print(f"[yellow]Warning:[/yellow] Unknown voice '{v}'")
397
+ elif language:
398
+ # Filter by language
399
+ for v in VOICES:
400
+ prefix = v[:2]
401
+ lang_code = VOICE_PREFIX_TO_LANG.get(prefix, "?")
402
+ if lang_code == language:
403
+ selected_voices.append(v)
404
+ else:
405
+ # All voices
406
+ selected_voices = list(VOICES)
407
+
408
+ if not selected_voices:
409
+ console.print("[red]Error:[/red] No voices selected.")
410
+ sys.exit(1)
411
+
412
+ # Determine output path and whether to save
413
+ save_output = output is not None or not play_audio
414
+
415
+ if separate:
416
+ if output is None:
417
+ output = Path("./voice_demos")
418
+ output.mkdir(parents=True, exist_ok=True)
419
+ console.print(f"[bold]Output directory:[/bold] {output}")
420
+ elif save_output:
421
+ if output is None:
422
+ output = Path("./voices_demo.wav")
423
+ console.print(f"[bold]Output file:[/bold] {output}")
424
+
425
+ console.print(f"[dim]Voices: {len(selected_voices)}[/dim]")
426
+ console.print(f"[dim]Speed: {speed}x[/dim]")
427
+ console.print(f"[dim]GPU: {'enabled' if gpu else 'disabled'}[/dim]")
428
+
429
+ # Initialize TTS pipeline
430
+ try:
431
+ kokoro = Kokoro(
432
+ model_path=model_path,
433
+ voices_path=voices_path,
434
+ use_gpu=gpu,
435
+ )
436
+ generation = GenerationConfig(speed=speed, lang="en-us")
437
+ pipeline_config = PipelineConfig(
438
+ voice=DEFAULT_CONFIG.get("default_voice", "af_heart"),
439
+ generation=generation,
440
+ model_path=model_path,
441
+ voices_path=voices_path,
442
+ )
443
+ pipeline = KokoroPipeline(
444
+ pipeline_config,
445
+ phoneme_processing=OnnxPhonemeProcessorAdapter(kokoro),
446
+ audio_generation=OnnxAudioGenerationAdapter(kokoro),
447
+ audio_postprocessing=OnnxAudioPostprocessingAdapter(kokoro),
448
+ )
449
+ except Exception as e:
450
+ console.print(f"[red]Error initializing TTS engine:[/red] {e}")
451
+ sys.exit(1)
452
+
453
+ # Generate samples
454
+ all_samples: list[np.ndarray] = []
455
+ sample_rate = 24000 # Kokoro sample rate
456
+
457
+ # Create silence array for gaps between samples
458
+ silence_samples = np.zeros(int(silence * sample_rate), dtype=np.float32)
459
+
460
+ with Progress(
461
+ SpinnerColumn(),
462
+ TextColumn("[progress.description]{task.description}"),
463
+ BarColumn(),
464
+ TaskProgressColumn(),
465
+ TimeElapsedColumn(),
466
+ console=console,
467
+ ) as progress:
468
+ task = progress.add_task(
469
+ "Generating voice demos...", total=len(selected_voices)
470
+ )
471
+
472
+ for voice in selected_voices:
473
+ # Determine language and text for this voice
474
+ prefix = voice[:2]
475
+ lang_code = VOICE_PREFIX_TO_LANG.get(prefix, "a")
476
+
477
+ if text:
478
+ demo_text = text.format(voice=voice)
479
+ else:
480
+ demo_text = DEMO_TEXT.get(lang_code, DEMO_TEXT["a"]).format(voice=voice)
481
+
482
+ try:
483
+ onnx_lang = LANG_CODE_TO_ONNX.get(lang_code, "en-us")
484
+ result = pipeline.run(demo_text, voice=voice, lang=onnx_lang)
485
+ samples = result.audio
486
+ sr = result.sample_rate
487
+
488
+ if separate and output is not None:
489
+ # Save individual file
490
+ import soundfile as sf
491
+
492
+ voice_file = output / f"{voice}.wav"
493
+ sf.write(str(voice_file), samples, sr)
494
+ progress.console.print(f" [green]{voice}[/green]: {voice_file}")
495
+ else:
496
+ all_samples.append(samples)
497
+ if voice != selected_voices[-1]:
498
+ all_samples.append(silence_samples)
499
+
500
+ except Exception as e:
501
+ console.print(f" [red]{voice}[/red]: Failed - {e}")
502
+
503
+ progress.advance(task)
504
+
505
+ # Handle combined output (not separate mode)
506
+ if not separate and all_samples:
507
+ combined = np.concatenate(all_samples)
508
+
509
+ # Play audio if requested
510
+ if play_audio:
511
+ sd = _require_sounddevice()
512
+
513
+ console.print("[dim]Playing audio...[/dim]")
514
+ sd.play(combined, sample_rate)
515
+ sd.wait()
516
+ console.print("[green]Playback complete.[/green]")
517
+
518
+ # Save to file if output specified or not in play-only mode
519
+ if save_output and output is not None:
520
+ import soundfile as sf
521
+
522
+ sf.write(str(output), combined, sample_rate)
523
+ console.print(f"[green]Demo saved to:[/green] {output}")
524
+
525
+ # Show duration
526
+ duration_secs = len(combined) / sample_rate
527
+ mins, secs = divmod(int(duration_secs), 60)
528
+ console.print(f"[dim]Duration: {mins}m {secs}s[/dim]")
529
+ elif separate:
530
+ console.print(
531
+ f"\n[green]Saved {len(selected_voices)} voice demos to:[/green] {output}"
532
+ )
533
+
534
+
535
+ def _resolve_model_source_and_variant(cfg: dict) -> tuple[ModelSource, ModelVariant]:
536
+ """Resolve model_source/model_variant with safe defaults."""
537
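+ # Unknown values fall back to DEFAULT_MODEL_SOURCE / DEFAULT_MODEL_VARIANT before the
+ # v1.1-de adjustment below.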
+ source = str(cfg.get("model_source", DEFAULT_MODEL_SOURCE))
538
+ variant = str(cfg.get("model_variant", DEFAULT_MODEL_VARIANT))
539
+
540
+ # Keep this permissive; Kokoro/pykokoro will validate deeper.
541
+ if source not in ("huggingface", "github"):
542
+ source = DEFAULT_MODEL_SOURCE
543
+ if variant not in ("v1.0", "v1.1-zh", "v1.1-de"):
544
+ variant = DEFAULT_MODEL_VARIANT
545
+
546
+ # v1.1-de is typically GitHub-only in this backend.
547
+ if variant == "v1.1-de" and source == "huggingface":
548
+ console.print(
549
+ "[yellow]Note:[/yellow] model_variant 'v1.1-de' is not available via "
550
+ "Hugging Face in this backend. Switching model_source to 'github' "
551
+ "for the download."
552
+ )
553
+ source = "github"
554
+
555
+ return cast(ModelSource, source), cast(ModelVariant, variant)
556
+
557
+
558
+ def _get_cache_voices_path(
559
+ model_source: ModelSource,
560
+ model_variant: ModelVariant,
561
+ ) -> Path:
562
+ """Return the *actual* voices archive path used by the backend."""
563
+ voices_dir: Path = Path(
564
+ str(get_voices_dir(source=model_source, variant=model_variant))
565
+ )
566
+ if model_source == "huggingface":
567
+ return voices_dir / "voices.bin.npz"
568
+
569
+ # github: filename depends on variant
570
+ if model_variant == "v1.0":
571
+ return voices_dir / str(GITHUB_VOICES_FILENAME_V1_0)
572
+ if model_variant == "v1.1-zh":
573
+ return voices_dir / str(GITHUB_VOICES_FILENAME_V1_1_ZH)
574
+ return voices_dir / str(GITHUB_VOICES_FILENAME_V1_1_DE)
575
+
576
+
577
+ def _exists_nonempty(path: Path) -> bool:
578
+ return path.exists() and path.is_file() and path.stat().st_size > 0
579
+
580
+
581
+ def _copy_to_target(src: Path, dst: Path) -> None:
582
+ dst.parent.mkdir(parents=True, exist_ok=True)
583
+ shutil.copy2(src, dst)
584
+
585
+
586
+ @click.command()
587
+ @click.option("--force", is_flag=True, help="Force re-download even if files exist.")
588
+ @click.option(
589
+ "--quality",
590
+ "-q",
591
+ type=click.Choice(list(MODEL_QUALITY_FILES.keys())),
592
+ default=None,
593
+ help="Model quality/quantization level. Default: from config or fp32.",
594
+ )
595
+ @click.pass_context
596
+ def download(ctx: click.Context, force: bool, quality: str | None) -> None:
597
+ """Download ONNX model and voice files required for TTS.
598
+
599
+ Downloads from Hugging Face (onnx-community/Kokoro-82M-v1.0-ONNX) by default,
+ or from GitHub when the model_source config key is set to 'github'.
600
+
601
+ Quality options:
602
+ fp32 - Full precision (326 MB) - Best quality, default
603
+ fp16 - Half precision (163 MB) - Good quality, smaller
604
+ q8 - 8-bit quantized (92 MB) - Good quality, compact
605
+ q8f16 - 8-bit with fp16 (86 MB) - Smallest file
606
+ q4 - 4-bit quantized (305 MB)
607
+ q4f16 - 4-bit with fp16 (155 MB)
608
+ uint8 - Unsigned 8-bit (177 MB)
609
+ uint8f16 - Unsigned 8-bit with fp16 (114 MB)
610
+ """
611
+ cfg = load_config()
612
+
613
+ # Get quality from config if not specified
614
+ if quality is None:
615
+ quality = cfg.get("model_quality", DEFAULT_MODEL_QUALITY)
616
+
617
+ # Cast to ModelQuality - safe because click.Choice validates input
618
+ # and config uses a valid default
619
+ model_quality = cast(ModelQuality, quality)
620
+
621
+ model_source, model_variant = _resolve_model_source_and_variant(cfg)
622
+
623
+ # Paths where pykokoro actually stores files (cache)
624
+ cache_model_dir = get_model_dir(source=model_source, variant=model_variant)
625
+ cache_model_path = get_model_path(
626
+ quality=model_quality, source=model_source, variant=model_variant
627
+ )
628
+ cache_config_path = get_config_path(variant=model_variant)
629
+ cache_voices_path = _get_cache_voices_path(model_source, model_variant)
630
+
631
+ # Optional CLI overrides (set by the root click group)
632
+ model_path_override: Path | None = None
633
+ voices_path_override: Path | None = None
634
+ if ctx.obj:
635
+ model_path_override = ctx.obj.get("model_path")
636
+ voices_path_override = ctx.obj.get("voices_path")
637
+
638
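+ # Downloads always go to the pykokoro cache first; override paths receive a copy at the end.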
+ target_model_path = model_path_override or cache_model_path
639
+ target_voices_path = voices_path_override or cache_voices_path
640
+
641
+ console.print(f"[bold]Model source:[/bold] {model_source}")
642
+ console.print(f"[bold]Model variant:[/bold] {model_variant}")
643
+ console.print(f"[bold]Model quality:[/bold] {model_quality}")
644
+ console.print(f"[bold]Cache model dir:[/bold] {cache_model_dir}")
645
+ console.print(f"[bold]Model path:[/bold] {target_model_path}")
646
+ console.print(f"[bold]Voices path:[/bold] {target_voices_path}")
647
+ console.print(f"[bold]Config path:[/bold] {cache_config_path}")
648
+
649
+ # Check if already downloaded
650
+ already_downloaded = (
651
+ _exists_nonempty(cache_config_path)
652
+ and _exists_nonempty(target_model_path)
653
+ and _exists_nonempty(target_voices_path)
654
+ )
655
+ if already_downloaded and not force:
656
+ console.print("[green]All required files are already present.[/green]")
657
+ console.print(f" config.json: {format_size(cache_config_path.stat().st_size)}")
658
+ model_size = format_size(target_model_path.stat().st_size)
659
+ voices_size = format_size(target_voices_path.stat().st_size)
660
+ console.print(f" {target_model_path.name}: {model_size}")
661
+ console.print(f" {target_voices_path.name}: {voices_size}")
662
+ return
663
+ console.print("Downloading model assets...")
664
+ with Progress(
665
+ SpinnerColumn(),
666
+ TextColumn("[progress.description]{task.description}"),
667
+ BarColumn(),
668
+ TaskProgressColumn(),
669
+ console=console,
670
+ ) as progress:
671
+ # ---- config.json (no byte-level callback in new backend)
672
+ if not is_config_downloaded(variant=model_variant) or force:
673
+ config_task = progress.add_task("Downloading config.json...", total=1)
674
+ try:
675
+ download_config(variant=model_variant, force=force)
676
+ progress.advance(config_task)
677
+ size = format_size(cache_config_path.stat().st_size)
678
+ console.print(f" [green]config.json[/green]: {size}")
679
+ except Exception as e:
680
+ console.print(f" [red]config.json: Failed - {e}[/red]")
681
+ sys.exit(1)
682
+ else:
683
+ console.print(" [dim]config.json: already downloaded[/dim]")
684
+
685
+ # ---- model (HF/GitHub)
686
+ model_task = progress.add_task(
687
+ f"Downloading {cache_model_path.name}...", total=1
688
+ )
689
+ if not _exists_nonempty(cache_model_path) or force:
690
+ try:
691
+ if model_source == "github":
692
+ download_model_github(
693
+ variant=model_variant, quality=model_quality, force=force
694
+ )
695
+ else:
696
+ download_model(
697
+ variant=model_variant, quality=model_quality, force=force
698
+ )
699
+ progress.advance(model_task)
700
+ size = format_size(cache_model_path.stat().st_size)
701
+ console.print(f" [green]{cache_model_path.name}[/green]: {size}")
702
+ except Exception as e:
703
+ console.print(f" [red]{cache_model_path.name}: Failed - {e}[/red]")
704
+ sys.exit(1)
705
+ else:
706
+ progress.advance(model_task)
707
+ console.print(f" [dim]{cache_model_path.name}: already downloaded[/dim]")
708
+
709
+ # ---- voices
710
+ if model_source == "huggingface":
711
+ voice_names = VOICE_NAMES_BY_VARIANT.get(model_variant, VOICE_NAMES)
712
+ total_voices = len(voice_names)
713
+ voices_task = progress.add_task(
714
+ f"Downloading voices (0/{total_voices})...", total=total_voices
715
+ )
716
+
717
+ def voices_progress(voice_name: str, current: int, total: int) -> None:
718
+ # backend calls (voice_name, idx, total) *before* each voice download
719
+ shown = min(current + 1, total)
720
+ progress.update(
721
+ voices_task,
722
+ description=f"Downloading voices ({shown}/{total}) — {voice_name}",
723
+ completed=current,
724
+ )
725
+
726
+ try:
727
+ download_all_voices(
728
+ variant=model_variant,
729
+ progress_callback=voices_progress,
730
+ force=force,
731
+ )
732
+ progress.update(voices_task, completed=total_voices)
733
+ size = format_size(cache_voices_path.stat().st_size)
734
+ console.print(f" [green]{cache_voices_path.name}[/green]: {size}")
735
+ except Exception as e:
736
+ console.print(f" [red]voices: Failed - {e}[/red]")
737
+ sys.exit(1)
738
+ else:
739
+ voices_task = progress.add_task(
740
+ f"Downloading {cache_voices_path.name}...", total=1
741
+ )
742
+ if not _exists_nonempty(cache_voices_path) or force:
743
+ try:
744
+ download_voices_github(variant=model_variant, force=force)
745
+ progress.advance(voices_task)
746
+ size = format_size(cache_voices_path.stat().st_size)
747
+ console.print(f" [green]{cache_voices_path.name}[/green]: {size}")
748
+ except Exception as e:
749
+ console.print(
750
+ f" [red]{cache_voices_path.name}: Failed - {e}[/red]"
751
+ )
752
+ sys.exit(1)
753
+ else:
754
+ progress.advance(voices_task)
755
+ console.print(
756
+ f" [dim]{cache_voices_path.name}: already downloaded[/dim]"
757
+ )
758
+
759
+ # Copy to override paths if provided
760
+ # so Kokoro() sees the files where ttsforge points it.
761
+ try:
762
+ if model_path_override and cache_model_path != model_path_override:
763
+ _copy_to_target(cache_model_path, model_path_override)
764
+ console.print(f"[green]Copied model to:[/green] {model_path_override}")
765
+ if voices_path_override and cache_voices_path != voices_path_override:
766
+ _copy_to_target(cache_voices_path, voices_path_override)
767
+ console.print(f"[green]Copied voices to:[/green] {voices_path_override}")
768
+ except Exception as e:
769
+ console.print(f"[red]Error copying files to custom paths:[/red] {e}")
770
+ sys.exit(1)
771
+ console.print("\n[green]All model files downloaded successfully![/green]")
772
+
773
+
774
+ @click.command()
775
+ @click.option("--show", is_flag=True, help="Show current configuration.")
776
+ @click.option("--reset", is_flag=True, help="Reset configuration to defaults.")
777
+ @click.option(
778
+ "--set",
779
+ "set_option",
780
+ nargs=2,
781
+ multiple=True,
782
+ metavar="KEY VALUE",
783
+ help="Set a configuration option.",
784
+ )
785
+ def config(show: bool, reset: bool, set_option: tuple[tuple[str, str], ...]) -> None:
786
+ """Manage ttsforge configuration.
787
+
788
+ Configuration is stored in ~/.config/ttsforge/config.json
789
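+
+ Examples:
+
+ \b
+ # Show the current settings
+ ttsforge config --show
+
+ \b
+ # Change settings (keys mirror DEFAULT_CONFIG, e.g. default_voice, use_gpu, model_quality)
+ ttsforge config --set default_voice af_heart --set use_gpu true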
+ """
790
+ if reset:
791
+ reset_config()
792
+ console.print("[green]Configuration reset to defaults.[/green]")
793
+ return
794
+
795
+ if set_option:
796
+ current_config = load_config()
797
+ for key, value in set_option:
798
+ if key not in DEFAULT_CONFIG:
799
+ console.print(f"[yellow]Warning:[/yellow] Unknown option '{key}'")
800
+ continue
801
+
802
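+ # Values are coerced to the type of the default, e.g. "--set use_gpu true" stores True.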
+ # Type conversion
803
+ default_type = type(DEFAULT_CONFIG[key])
804
+ typed_value: Any
805
+ try:
806
+ if default_type is bool:
807
+ typed_value = value.lower() in ("true", "1", "yes")
808
+ elif default_type is float:
809
+ typed_value = float(value)
810
+ elif default_type is int:
811
+ typed_value = int(value)
812
+ else:
813
+ typed_value = value
814
+
815
+ current_config[key] = typed_value
816
+ console.print(f"[green]Set {key} = {typed_value}[/green]")
817
+ except ValueError:
818
+ console.print(f"[red]Invalid value for {key}: {value}[/red]")
819
+
820
+ save_config(current_config)
821
+ return
822
+
823
+ # Show configuration
824
+ current_config = load_config()
825
+
826
+ table = Table(title="Current Configuration")
827
+ table.add_column("Option", style="bold")
828
+ table.add_column("Value")
829
+ table.add_column("Default", style="dim")
830
+
831
+ for key, default_value in DEFAULT_CONFIG.items():
832
+ current_value = current_config.get(key, default_value)
833
+ is_default = current_value == default_value
834
+ table.add_row(
835
+ key,
836
+ str(current_value),
837
+ str(default_value) if not is_default else "",
838
+ )
839
+
840
+ console.print(table)
841
+
842
+ # Show model status
843
+ cfg2 = load_config()
844
+ q = cast(ModelQuality, cfg2.get("model_quality", DEFAULT_MODEL_QUALITY))
845
+ src, var = _resolve_model_source_and_variant(cfg2)
846
+ cfg_path = get_config_path(variant=var)
847
+ mdl_path = get_model_path(quality=q, source=src, variant=var)
848
+ v_path = _get_cache_voices_path(src, var)
849
+
850
+ if (
851
+ _exists_nonempty(cfg_path)
852
+ and _exists_nonempty(mdl_path)
853
+ and _exists_nonempty(v_path)
854
+ ):
855
+ model_dir = get_model_dir(source=src, variant=var)
856
+ console.print(f"\n[bold]ONNX Models:[/bold] Downloaded ({model_dir})")
857
+ console.print(f" config.json: {cfg_path}")
858
+ console.print(f" model: {mdl_path}")
859
+ console.print(f" voices: {v_path}")
860
+ else:
861
+ console.print("\n[bold]ONNX Models:[/bold] [yellow]Not downloaded[/yellow]")
862
+ console.print("[dim]Run 'ttsforge download' to download models[/dim]")
863
+
864
+
865
+ @click.command(name="extract-names")
866
+ @click.argument(
867
+ "input_file",
868
+ type=click.Path(exists=True, path_type=Path),
869
+ )
870
+ @click.option(
871
+ "-o",
872
+ "--output",
873
+ type=click.Path(path_type=Path),
874
+ default=None,
875
+ help="Output JSON file path (default: INPUT_FILE_custom_phonemes.json).",
876
+ )
877
+ @click.option(
878
+ "--min-count",
879
+ type=int,
880
+ default=3,
881
+ help="Minimum occurrences for a name to be included (default: 3).",
882
+ )
883
+ @click.option(
884
+ "--max-names",
885
+ type=int,
886
+ default=500,
887
+ help="Maximum number of names to extract (default: 500).",
888
+ )
889
+ @click.option(
890
+ "-l",
891
+ "--language",
892
+ type=click.Choice(list(LANGUAGE_DESCRIPTIONS.keys())),
893
+ default="a",
894
+ help="Language for phoneme generation (default: a).",
895
+ )
896
+ @click.option(
897
+ "--include-all",
898
+ is_flag=True,
899
+ help="Include all detected proper nouns (ignore min-count).",
900
+ )
901
+ @click.option(
902
+ "--preview",
903
+ is_flag=True,
904
+ help="Preview extracted names without saving to file.",
905
+ )
906
+ @click.option(
907
+ "--chunk-size",
908
+ type=int,
909
+ default=100000,
910
+ help="Characters per chunk for processing (default: 100000).",
911
+ )
912
+ @click.option(
913
+ "--chapters",
914
+ type=str,
915
+ default=None,
916
+ help="Specific chapters to process (e.g., '1,3,5-10' or 'all'). Default: all.",
917
+ )
918
+ def extract_names(
919
+ input_file: Path,
920
+ output: Path | None,
921
+ min_count: int,
922
+ max_names: int,
923
+ language: str,
924
+ include_all: bool,
925
+ preview: bool,
926
+ chunk_size: int,
927
+ chapters: str | None,
928
+ ) -> None:
929
+ """Extract proper names from a book and generate phoneme dictionary.
930
+
931
+ Scans INPUT_FILE (EPUB or TXT) for proper names and creates a JSON phoneme
932
+ dictionary with auto-generated pronunciation suggestions. You can then review
933
+ and edit the suggestions before using them for TTS conversion.
934
+
935
+ Examples:
936
+
937
+ \b
938
+ # Extract names and save to default file
939
+ ttsforge extract-names mybook.epub
940
+
941
+ \b
942
+ # Preview names without saving
943
+ ttsforge extract-names mybook.epub --preview
944
+
945
+ \b
946
+ # Extract frequent names only (10+ occurrences)
947
+ ttsforge extract-names mybook.epub --min-count 10 -o names.json
948
+
949
+ \b
950
+ # Extract from specific chapters
951
+ ttsforge extract-names mybook.epub --chapters 1,3,5-10
952
+
953
+ \b
954
+ # Extract from chapter range
955
+ ttsforge extract-names mybook.epub --chapters 5-15
956
+
957
+ \b
958
+ # Then use the dictionary for conversion
959
+ ttsforge convert mybook.epub --phoneme-dict custom_phonemes.json
960
+ """
961
+ from rich.table import Table
962
+
963
+ from ..input_reader import InputReader
964
+ from ..name_extractor import (
965
+ extract_names_from_text,
966
+ generate_phoneme_suggestions,
967
+ save_phoneme_dictionary,
968
+ )
969
+
970
+ # Set default output filename
971
+ if output is None:
972
+ output = input_file.with_name(f"{input_file.stem}_custom_phonemes.json")
973
+
974
+ console.print(f"[bold]Extracting names from:[/bold] {input_file}")
975
+
976
+ # Read file content
977
+ try:
978
+ reader = InputReader(input_file)
979
+ all_chapters = reader.get_chapters()
980
+
981
+ # Determine which chapters to process
982
+ if chapters is not None:
983
+ # Parse chapter selection (supports 'all', ranges, and specific chapters)
984
+ try:
985
+ selected_indices = parse_chapter_selection(chapters, len(all_chapters))
986
+ selected_chapters = [all_chapters[i] for i in selected_indices]
987
+ except ValueError as exc:
988
+ console.print(f"[yellow]{exc}[/yellow]")
989
+ sys.exit(1)
990
+
991
+ if len(selected_chapters) < len(all_chapters):
992
+ console.print(
993
+ f"[dim]Processing {len(selected_chapters)} of "
994
+ f"{len(all_chapters)} chapters[/dim]"
995
+ )
996
+ else:
997
+ # Use all chapters by default
998
+ selected_chapters = all_chapters
999
+
1000
+ # Remove chapter markers before joining text
1001
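+ # e.g. a leading "<<CHAPTER: Chapter One>>" line is stripped from each chapter's text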
+ text = "\n\n".join(
1002
+ re.sub(
1003
+ r"^\s*<<CHAPTER:[^>]*>>\s*\n*",
1004
+ "",
1005
+ chapter.text,
1006
+ count=1,
1007
+ flags=re.MULTILINE,
1008
+ )
1009
+ for chapter in selected_chapters
1010
+ )
1011
+
1012
+ except Exception as e:
1013
+ console.print(f"[red]Error loading file:[/red] {e}")
1014
+ raise SystemExit(1) from None
1015
+
1016
+ # Check if spaCy is available
1017
+ try:
1018
+ import spacy # noqa: F401
1019
+ except ImportError:
1020
+ console.print(
1021
+ "[red]Error:[/red] spaCy is required for name extraction.\n"
1022
+ "[yellow]Install with:[/yellow]\n"
1023
+ " pip install spacy\n"
1024
+ " python -m spacy download en_core_web_sm"
1025
+ )
1026
+ raise SystemExit(1) from None
1027
+
1028
+ # Extract names
1029
+ with Progress(
1030
+ SpinnerColumn(),
1031
+ TextColumn("[progress.description]{task.description}"),
1032
+ console=console,
1033
+ ) as progress:
1034
+ task = progress.add_task("Analyzing text and extracting names...", total=None)
1035
+
1036
+ # Progress callback to update progress bar
1037
+ def update_progress(current: int, total: int) -> None:
1038
+ progress.update(
1039
+ task,
1040
+ description=f"Processing chunk {current}/{total}...",
1041
+ completed=current,
1042
+ total=total,
1043
+ )
1044
+
1045
+ try:
1046
+ names = extract_names_from_text(
1047
+ text,
1048
+ min_count=min_count,
1049
+ max_names=max_names,
1050
+ include_all=include_all,
1051
+ chunk_size=chunk_size,
1052
+ progress_callback=update_progress,
1053
+ )
1054
+ except ImportError as e:
1055
+ console.print(f"[red]Error:[/red] {e}")
1056
+ raise SystemExit(1) from None
1057
+
1058
+ if not names:
1059
+ console.print(
1060
+ f"[yellow]No names found[/yellow] (min_count={min_count}). "
1061
+ f"Try lowering --min-count or using --include-all."
1062
+ )
1063
+ return
1064
+
1065
+ console.print(f"\n[green]Found {len(names)} proper names[/green]\n")
1066
+
1067
+ # Generate phoneme suggestions
1068
+ with Progress(
1069
+ SpinnerColumn(),
1070
+ TextColumn("[progress.description]{task.description}"),
1071
+ console=console,
1072
+ ) as progress:
1073
+ progress.add_task("Generating phoneme suggestions...", total=None)
1074
+ suggestions = generate_phoneme_suggestions(names, language)
1075
+
1076
+ # Display results in a table
1077
+ table = Table(title=f"Extracted Names (≥{min_count} occurrences)")
1078
+ table.add_column("Name", style="cyan", no_wrap=True)
1079
+ table.add_column("Count", justify="right", style="magenta")
1080
+ table.add_column("Suggested Phoneme", style="green")
1081
+
1082
+ for name in sorted(names.keys(), key=lambda n: names[n], reverse=True):
1083
+ entry = suggestions[name]
1084
+ phoneme = entry["phoneme"]
1085
+ count = entry["occurrences"]
1086
+
1087
+ # Highlight errors
1088
+ if entry.get("suggestion_quality") == "error":
1089
+ phoneme_display = f"[red]{phoneme}[/red]"
1090
+ else:
1091
+ phoneme_display = phoneme
1092
+
1093
+ table.add_row(name, str(count), phoneme_display)
1094
+
1095
+ console.print(table)
1096
+
1097
+ # Save or preview
1098
+ if preview:
1099
+ console.print("\n[dim]Preview mode - no file saved.[/dim]")
1100
+ console.print("[dim]To save, run without --preview flag.[/dim]")
1101
+ else:
1102
+ save_phoneme_dictionary(
1103
+ suggestions, output, source_file=str(input_file.name), language=language
1104
+ )
1105
+ console.print(f"\n[green]✓ Saved to:[/green] {output}")
1106
+ console.print(
1107
+ "\n[dim]Next steps:[/dim]\n"
1108
+ f" 1. Review and edit {output} to fix any incorrect phonemes\n"
1109
+ f" 2. Use with: [cyan]ttsforge convert {input_file} "
1110
+ f"--phoneme-dict {output}[/cyan]"
1111
+ )
1112
+
1113
+
1114
+ @click.command(name="list-names")
1115
+ @click.argument(
1116
+ "phoneme_dict",
1117
+ type=click.Path(exists=True, path_type=Path),
1118
+ )
1119
+ @click.option(
1120
+ "--sort-by",
1121
+ type=click.Choice(["name", "count", "alpha"]),
1122
+ default="count",
1123
+ help="Sort by: name (same as alpha), count (occurrences), alpha (alphabetical).",
1124
+ )
1125
+ @click.option(
1126
+ "--play",
1127
+ is_flag=True,
1128
+ help="Play audio preview for each name (interactive mode).",
1129
+ )
1130
+ @click.option(
1131
+ "-v",
1132
+ "--voice",
1133
+ type=str,
1134
+ default="af_sky",
1135
+ help="Voice to use for audio preview (default: af_sky).",
1136
+ )
1137
+ @click.option(
1138
+ "-l",
1139
+ "--language",
1140
+ type=str,
1141
+ default="a",
1142
+ help=(
1143
+ "Language code for audio preview "
1144
+ "(e.g., 'de', 'en-us', 'a' for auto, default: a)."
1145
+ ),
1146
+ )
1147
+ def list_names( # noqa: C901
1148
+ phoneme_dict: Path, sort_by: str, play: bool, voice: str, language: str
1149
+ ) -> None:
1150
+ """List all names in a phoneme dictionary for review.
1151
+
1152
+ Displays the contents of a phoneme dictionary in a readable table format,
1153
+ making it easy to review and identify names that need phoneme corrections.
1154
+
1155
+ Use --play to interactively listen to each name pronunciation.
1156
+
1157
+ Examples:
1158
+
1159
+ \b
1160
+ # List names sorted by frequency
1161
+ ttsforge list-names custom_phonemes.json
1162
+
1163
+ \b
1164
+ # List names alphabetically
1165
+ ttsforge list-names custom_phonemes.json --sort-by alpha
1166
+
1167
+ \b
1168
+ # Interactive audio preview
1169
+ ttsforge list-names custom_phonemes.json --play
1170
+
1171
+ \b
1172
+ # Audio preview with different voice and language
1173
+ ttsforge list-names custom_phonemes.json --play --voice af_bella --language de
1174
+ """
1175
+ import json
1176
+
1177
+ from rich.table import Table
1178
+
1179
+ from ..conversion import ConversionOptions, TTSConverter
1180
+
1181
+ # Load dictionary
1182
+ try:
1183
+ with open(phoneme_dict, encoding="utf-8") as f:
1184
+ data = json.load(f)
1185
+ except Exception as e:
1186
+ console.print(f"[red]Error loading dictionary:[/red] {e}")
1187
+ raise SystemExit(1) from None
1188
+
1189
+ # Parse dictionary format
1190
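+ # Two layouts are accepted (illustrative values):
+ #   simple:   {"Aragorn": "/phonemes/"}
+ #   metadata: {"entries": {"Aragorn": {"phoneme": "...", "occurrences": 12}}, "_metadata": {...}}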
+ if "entries" in data:
1191
+ # Metadata format
1192
+ entries = data["entries"]
1193
+ metadata = data.get("_metadata", {})
1194
+ else:
1195
+ # Simple format
1196
+ entries = data
1197
+ metadata = {}
1198
+
1199
+ if not entries:
1200
+ console.print("[yellow]Dictionary is empty.[/yellow]")
1201
+ return
1202
+
1203
+ # Show metadata if available
1204
+ if metadata:
1205
+ console.print("[dim]Dictionary info:[/dim]")
1206
+ if "generated_from" in metadata:
1207
+ console.print(f" Generated from: {metadata['generated_from']}")
1208
+ if "generated_at" in metadata:
1209
+ console.print(f" Generated at: {metadata['generated_at']}")
1210
+ if "language" in metadata:
1211
+ console.print(f" Language: {metadata['language']}")
1212
+ console.print()
1213
+
1214
+ # Create table
1215
+ table = Table(title=f"Phoneme Dictionary: {phoneme_dict.name}")
1216
+ table.add_column("Name", style="cyan", no_wrap=True)
1217
+ table.add_column("Phoneme", style="green")
1218
+ table.add_column("Count", justify="right", style="magenta")
1219
+ table.add_column("Status", style="yellow")
1220
+
1221
+ # Sort entries
1222
+ items = list(entries.items())
1223
+ if sort_by in ["name", "alpha"]:
1224
+ items.sort(key=lambda x: x[0].lower())
1225
+ elif sort_by == "count":
1226
+ items.sort(
1227
+ key=lambda x: (x[1].get("occurrences", 0) if isinstance(x[1], dict) else 0),
1228
+ reverse=True,
1229
+ )
1230
+
1231
+ # Add rows
1232
+ for name, value in items:
1233
+ if isinstance(value, str):
1234
+ # Simple format
1235
+ phoneme = value
1236
+ count = "-"
1237
+ status = "manual"
1238
+ else:
1239
+ # Metadata format
1240
+ phoneme = value.get("phoneme", "-")
1241
+ count = str(value.get("occurrences", "-"))
1242
+
1243
+ # Determine status
1244
+ if value.get("verified"):
1245
+ status = "✓ verified"
1246
+ elif value.get("suggestion_quality") == "error":
1247
+ status = "⚠ error"
1248
+ elif value.get("suggestion_quality") == "auto":
1249
+ status = "auto"
1250
+ else:
1251
+ status = "manual"
1252
+
1253
+ # Highlight issues
1254
+ if phoneme == "/FIXME/":
1255
+ phoneme = "[red]/FIXME/[/red]"
1256
+ status = "[red]needs fix[/red]"
1257
+
1258
+ table.add_row(name, phoneme, count, status)
1259
+
1260
+ console.print(table)
1261
+ console.print(f"\n[green]Total entries:[/green] {len(entries)}")
1262
+
1263
+ # Interactive audio preview mode
1264
+ if play:
1265
+ console.print("\n[bold]Audio Preview Mode[/bold]")
1266
+ console.print(
1267
+ "[dim]Press Enter to play each name, or type a number to jump "
1268
+ "to that entry.[/dim]"
1269
+ )
1270
+ console.print("[dim]Type 'q' to quit, 's' to skip, 'r' to replay.[/dim]\n")
1271
+
1272
+ # Initialize converter with phoneme dictionary
1273
+ try:
1274
+ # Auto-detect if voice is a blend
1275
+ parsed_voice, parsed_voice_blend = parse_voice_parameter(voice)
1276
+
1277
+ options = ConversionOptions(
1278
+ phoneme_dictionary_path=str(phoneme_dict),
1279
+ voice=parsed_voice or "af_sky",
1280
+ voice_blend=parsed_voice_blend,
1281
+ language=language,
1282
+ )
1283
+ converter = TTSConverter(options)
1284
+
1285
+ idx = 0
1286
+ while idx < len(items):
1287
+ name, value = items[idx]
1288
+
1289
+ if isinstance(value, str):
1290
+ phoneme = value
1291
+ else:
1292
+ phoneme = value.get("phoneme", "")
1293
+
1294
+ console.print(
1295
+ f"\n[cyan]{idx + 1}/{len(items)}:[/cyan] "
1296
+ f"[bold]{name}[/bold] → [green]{phoneme}[/green]"
1297
+ )
1298
+
1299
+ # Get user input
1300
+ user_input = (
1301
+ input(" [Enter=play, 's'=skip, 'r'=replay, 'q'=quit]: ")
1302
+ .strip()
1303
+ .lower()
1304
+ )
1305
+
1306
+ if user_input == "q":
1307
+ console.print("[dim]Exiting preview mode.[/dim]")
1308
+ break
1309
+ elif user_input == "s":
1310
+ idx += 1
1311
+ continue
1312
+ elif user_input.isdigit():
1313
+ # Jump to specific entry
1314
+ target_idx = int(user_input) - 1
1315
+ if 0 <= target_idx < len(items):
1316
+ idx = target_idx
1317
+ console.print(f"[dim]Jumping to entry {user_input}...[/dim]")
1318
+ continue
1319
+ else:
1320
+ console.print(
1321
+ f"[yellow]Invalid entry number. "
1322
+ f"Must be 1-{len(items)}[/yellow]"
1323
+ )
1324
+ continue
1325
+
1326
+ # Play audio (Enter or 'r')
1327
+ try:
1328
+ # Create a test sentence with the name
1329
+ test_text = f"The name {name} appears in the story."
1330
+
1331
+ # Create temp file
1332
+ with tempfile.NamedTemporaryFile(
1333
+ suffix=".wav", delete=False
1334
+ ) as tmp:
1335
+ temp_output = Path(tmp.name)
1336
+
1337
+ try:
1338
+ with console.status(f"Generating audio for '{name}'..."):
1339
+ result = converter.convert_text(test_text, temp_output)
1340
+
1341
+ if result.success:
1342
+ # Play the audio
1343
+ import soundfile as sf
1344
+
1345
+ audio_data, sample_rate = sf.read(str(temp_output))
1346
+ console.print("[dim]▶ Playing...[/dim]")
1347
+ sd = _require_sounddevice()
1348
+ sd.play(audio_data, sample_rate)
1349
+ sd.wait()
1350
+ console.print("[green]✓ Done[/green]")
1351
+ else:
1352
+ console.print(f"[red]Error:[/red] {result.error_message}")
1353
+
1354
+ finally:
1355
+ # Cleanup temp file
1356
+ if temp_output.exists():
1357
+ temp_output.unlink()
1358
+
1359
+ # Don't auto-advance on 'r' (replay)
1360
+ if user_input != "r":
1361
+ idx += 1
1362
+
1363
+ except Exception as e:
1364
+ console.print(f"[red]Error playing audio:[/red] {e}")
1365
+ idx += 1
1366
+ continue
1367
+
1368
+ except Exception as e:
1369
+ console.print(f"[red]Error initializing audio preview:[/red] {e}")
1370
+ console.print("[yellow]Make sure you have the TTS model loaded.[/yellow]")
1371
+
1372
+ # Show suggestions
1373
+ needs_review = sum(
1374
+ 1
1375
+ for entry in entries.values()
1376
+ if isinstance(entry, dict)
1377
+ and entry.get("suggestion_quality") == "auto"
1378
+ and not entry.get("verified")
1379
+ )
1380
+
1381
+ if needs_review > 0 and not play:
1382
+ console.print(
1383
+ f"\n[yellow]⚠ {needs_review} entries need review[/yellow] "
1384
+ f"(auto-generated, not verified)"
1385
+ )
1386
+ console.print(
1387
+ f"\n[dim]Tip:[/dim] Listen to samples with:\n"
1388
+ f" [cyan]ttsforge list-names {phoneme_dict} --play[/cyan]"
1389
+ )