ttsforge 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ttsforge/__init__.py +114 -0
- ttsforge/_version.py +34 -0
- ttsforge/audio_merge.py +180 -0
- ttsforge/audio_player.py +473 -0
- ttsforge/chapter_selection.py +75 -0
- ttsforge/cli/__init__.py +73 -0
- ttsforge/cli/commands_conversion.py +1927 -0
- ttsforge/cli/commands_phonemes.py +1033 -0
- ttsforge/cli/commands_utility.py +1389 -0
- ttsforge/cli/helpers.py +76 -0
- ttsforge/constants.py +164 -0
- ttsforge/conversion.py +1090 -0
- ttsforge/input_reader.py +408 -0
- ttsforge/kokoro_lang.py +12 -0
- ttsforge/kokoro_runner.py +125 -0
- ttsforge/name_extractor.py +305 -0
- ttsforge/phoneme_conversion.py +978 -0
- ttsforge/phonemes.py +486 -0
- ttsforge/ssmd_generator.py +422 -0
- ttsforge/utils.py +785 -0
- ttsforge/vocab/__init__.py +139 -0
- ttsforge-0.1.0.dist-info/METADATA +659 -0
- ttsforge-0.1.0.dist-info/RECORD +27 -0
- ttsforge-0.1.0.dist-info/WHEEL +5 -0
- ttsforge-0.1.0.dist-info/entry_points.txt +2 -0
- ttsforge-0.1.0.dist-info/licenses/LICENSE +21 -0
- ttsforge-0.1.0.dist-info/top_level.txt +1 -0
ttsforge/cli/helpers.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Shared utilities and helpers for CLI commands."""
|
|
2
|
+
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
|
|
5
|
+
# Shared console instance for all CLI commands
|
|
6
|
+
console = Console()
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_voice_parameter(voice: str) -> tuple[str | None, str | None]:
|
|
10
|
+
"""Parse voice parameter to detect if it's a single voice or a blend.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
voice: Voice parameter (e.g., 'af_sky' or 'af_nicole:50,am_michael:50')
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
Tuple of (voice, voice_blend) where one will be None
|
|
17
|
+
|
|
18
|
+
Examples:
|
|
19
|
+
>>> parse_voice_parameter('af_sky')
|
|
20
|
+
('af_sky', None)
|
|
21
|
+
>>> parse_voice_parameter('af_nicole:50,am_michael:50')
|
|
22
|
+
(None, 'af_nicole:50,am_michael:50')
|
|
23
|
+
"""
|
|
24
|
+
# Detect if it's a blend (contains both : and ,)
|
|
25
|
+
if ":" in voice and "," in voice:
|
|
26
|
+
return (None, voice)
|
|
27
|
+
else:
|
|
28
|
+
return (voice, None)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_version() -> str:
    """Return the package version string.

    The version is read from the ``_version`` module located one package
    level above this one. If that module cannot be imported, the placeholder
    ``"0.0.0+unknown"`` is returned instead.
    """
    try:
        # Imported inside the function so a failed import can be caught here.
        from .._version import version as package_version
    except ImportError:
        return "0.0.0+unknown"
    return package_version
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Sample passage used by default when testing TTS settings.
DEFAULT_SAMPLE_TEXT = " ".join(
    (
        "The quick brown fox jumps over the lazy dog.",
        "This sample text demonstrates the text-to-speech capabilities,",
        "including punctuation handling, and natural speech flow.",
    )
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Demo sample text per language, keyed by single-letter language code.
# Every template contains a ``{voice}`` placeholder for the voice name.
# The Spanish, French, Italian and Portuguese templates use their proper
# diacritics and punctuation: this text is meant to be spoken, and without
# the accents the words are misspelled (e.g. "genere" instead of "généré").
# NOTE(review): the "h", "j" and "z" templates are romanized rather than in
# native script; confirm the phonemizer handles romanized input for these
# languages, otherwise switch them to native script as well.
DEMO_TEXT = {
    "a": "Hello! This audio was generated by {voice}. How do you like it?",
    "b": "Hello! This audio was generated by {voice}. How do you like it?",
    "e": "¡Hola! Este audio fue generado por {voice}. ¿Qué te parece?",
    "f": "Bonjour! Cet audio a été généré par {voice}. Comment le trouvez-vous?",
    "h": "Namaste! Yah audio {voice} dwara banaya gaya hai. Aapko kaisa laga?",
    "i": "Ciao! Questo audio è stato generato da {voice}. Ti piace?",
    "j": "Konnichiwa! Kono onsei wa {voice} ni yotte sakusei saremashita.",
    "p": "Olá! Este áudio foi gerado por {voice}. O que você achou?",
    "z": "Ni hao! Zhe ge yinpin shi you {voice} shengcheng de.",
}
|
|
61
|
+
|
|
62
|
+
# Preset voice blends for demo command
# Format: (blend_string, description)
# Each blend_string is a comma-separated list of "voice_name:weight" entries;
# the weights in every preset below add up to 100.
VOICE_BLEND_PRESETS = [
    # Same language, different gender
    ("af_nicole:50,am_michael:50", "American female + male blend"),
    ("bf_emma:50,bm_george:50", "British female + male blend"),
    # Same gender, different accent
    ("af_heart:50,bf_emma:50", "American + British female blend"),
    ("am_adam:50,bm_daniel:50", "American + British male blend"),
    # Same gender, different voice
    ("af_nicole:50,af_bella:50", "Two American females blend"),
    ("am_adam:50,am_eric:50", "Two American males blend"),
    # Multi-voice blend (three entries, weighted 33/33/34)
    ("af_heart:33,af_nicole:33,af_bella:34", "Three American females blend"),
]
|
ttsforge/constants.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Constants for ttsforge - voices, languages, and formats."""
|
|
2
|
+
|
|
3
|
+
# from pykokoro.onnx_backend import VOICE_NAMES_V1_0
|
|
4
|
+
# from pykokoro.onnx_backend import VOICE_NAMES_V1_1_ZH, VOICE_NAMES_V1_1_DE
|
|
5
|
+
|
|
6
|
+
from pykokoro.onnx_backend import VOICE_NAMES_V1_0 as VOICE_NAMES
|
|
7
|
+
|
|
8
|
+
# Re-export from pykokoro for convenience
|
|
9
|
+
VOICES = VOICE_NAMES
|
|
10
|
+
|
|
11
|
+
# Audio constants from pykokoro
# SAMPLE_RATE is read from pykokoro.constants and coerced to int; if that
# import fails, the hard-coded value 24000 is used instead.
try:
    from pykokoro.constants import SAMPLE_RATE as _SAMPLE_RATE

    SAMPLE_RATE: int = int(_SAMPLE_RATE)
except ImportError:
    SAMPLE_RATE = 24000  # Fallback value

# Program Information
PROGRAM_NAME = "ttsforge"
PROGRAM_DESCRIPTION = "Generate audiobooks from EPUB files using Kokoro ONNX TTS."
|
|
22
|
+
|
|
23
|
+
# Language code to description mapping
# Keys are the single-letter ttsforge language codes; the same key set is
# used by DEFAULT_VOICE_FOR_LANG and SAMPLE_TEXTS in this module.
LANGUAGE_DESCRIPTIONS = {
    "a": "American English",
    "b": "British English",
    "e": "Spanish",
    "f": "French",
    "h": "Hindi",
    "i": "Italian",
    "j": "Japanese",
    "p": "Brazilian Portuguese",
    "z": "Mandarin Chinese",
}
|
|
35
|
+
|
|
36
|
+
# ISO language code to ttsforge language code mapping
# Keys are lower-case tags, either a bare language ("en") or a hyphenated
# language-region pair ("en-gb"). Regional variants without a dedicated
# ttsforge code share the code of a related variant, e.g. "en-au" -> "b"
# (British English) and "pt-pt" -> "p" (Brazilian Portuguese).
# NOTE(review): lookups presumably need the tag lower-cased first, since only
# lower-case keys exist here — confirm at the call sites.
ISO_TO_LANG_CODE = {
    "en": "a",  # Default to American English
    "en-us": "a",
    "en-gb": "b",
    "en-au": "b",
    "es": "e",
    "es-es": "e",
    "es-mx": "e",
    "fr": "f",
    "fr-fr": "f",
    "fr-ca": "f",
    "hi": "h",
    "it": "i",
    "ja": "j",
    "pt": "p",
    "pt-br": "p",
    "pt-pt": "p",
    "zh": "z",
    "zh-cn": "z",
    "zh-tw": "z",
}
|
|
58
|
+
|
|
59
|
+
# Voice prefix to language code mapping
# Each key is a two-letter prefix: the language code letter followed by
# "f" (female) or "m" (male). The value is that same language code letter.
# NOTE(review): presumably matched against the part of a voice name before
# the underscore (e.g. "af" in "af_heart") — confirm at the call sites.
VOICE_PREFIX_TO_LANG = {
    "af": "a",  # American Female
    "am": "a",  # American Male
    "bf": "b",  # British Female
    "bm": "b",  # British Male
    "ef": "e",  # Spanish Female
    "em": "e",  # Spanish Male
    "ff": "f",  # French Female
    "fm": "f",  # French Male
    "hf": "h",  # Hindi Female
    "hm": "h",  # Hindi Male
    "if": "i",  # Italian Female
    "im": "i",  # Italian Male
    "jf": "j",  # Japanese Female
    "jm": "j",  # Japanese Male
    "pf": "p",  # Portuguese Female
    "pm": "p",  # Portuguese Male
    "zf": "z",  # Chinese Female
    "zm": "z",  # Chinese Male
}
|
|
80
|
+
|
|
81
|
+
# Language code to default voice mapping
# One voice name per single-letter language code; every default listed here
# carries a female-voice prefix (second letter "f").
DEFAULT_VOICE_FOR_LANG = {
    "a": "af_heart",
    "b": "bf_emma",
    "e": "ef_dora",
    "f": "ff_siwis",
    "h": "hf_alpha",
    "i": "if_sara",
    "j": "jf_alpha",
    "p": "pf_dora",
    "z": "zf_xiaoxiao",
}
|
|
93
|
+
|
|
94
|
+
# Supported output audio formats
# Every entry appears in exactly one of FFMPEG_FORMATS or SOUNDFILE_FORMATS
# below, so the three lists must be kept in sync when a format is added.
SUPPORTED_OUTPUT_FORMATS = [
    "wav",
    "mp3",
    "flac",
    "opus",
    "m4b",
]

# Formats that require ffmpeg
FFMPEG_FORMATS = ["m4b", "opus"]

# Formats supported by soundfile directly
SOUNDFILE_FORMATS = ["wav", "mp3", "flac"]
|
|
108
|
+
|
|
109
|
+
# Default configuration values
DEFAULT_CONFIG = {
    "default_voice": "af_heart",
    "default_language": "a",
    "default_speed": 1.0,
    "default_format": "m4b",
    "use_gpu": False,  # GPU requires onnxruntime-gpu
    # Model quality: fp32, fp16, q8, q8f16, q4, q4f16, uint8, uint8f16
    "model_quality": "fp32",
    "model_variant": "v1.0",
    # NOTE(review): presumably seconds, like chapter_pause_after_title — confirm
    "silence_between_chapters": 2.0,
    "save_chapters_separately": False,
    "merge_at_end": True,
    "auto_detect_language": True,
    "default_split_mode": "auto",
    "default_content_mode": "chapters",  # Content mode for read: chapters or pages
    "default_page_size": 2000,  # Synthetic page size in characters for pages mode
    # NOTE(review): pause_* values are presumably durations in seconds — confirm
    "pause_clause": 0.5,
    "pause_sentence": 0.7,
    "pause_paragraph": 0.9,
    "pause_variance": 0.05,
    "pause_mode": "auto",  # "tts", "manual", or "auto"
    # Language override for phonemization (e.g., 'de', 'fr', 'en-us')
    # If None, language is determined from voice prefix
    "phonemization_lang": None,
    # Chapter announcement settings
    "announce_chapters": True,  # Read chapter titles aloud before content
    "chapter_pause_after_title": 2.0,  # Pause after chapter title (seconds)
    # Filename templates with {name} placeholders; ":03d" is a format spec
    # that zero-pads the chapter number to three digits.
    "output_filename_template": "{book_title}",
    "chapter_filename_template": "{chapter_num:03d}_{book_title}_{chapter_title}",
    "phoneme_export_template": "{book_title}",
    # Fallback title when metadata is missing
    "default_title": "Untitled",
    # Mixed-language phonemization settings (disabled by default)
    "use_mixed_language": False,  # Enable automatic language detection
    "mixed_language_primary": None,  # Primary language (None = use current lang)
    "mixed_language_allowed": None,  # List of allowed languages (required if enabled)
    "mixed_language_confidence": 0.7,  # Detection confidence threshold (0.0-1.0)
}

# Audio settings
# SAMPLE_RATE is imported from pykokoro at top of file (with a 24000 fallback)
AUDIO_CHANNELS = 1  # Single channel (mono)
|
|
152
|
+
|
|
153
|
+
# Sample texts for voice preview (per language)
# Keyed by single-letter language code; each value is one short sentence
# written in that language's own script.
SAMPLE_TEXTS = {
    "a": "This is a sample of the selected voice.",
    "b": "This is a sample of the selected voice.",
    # "muestra" is feminine, so the demonstrative must be "Esta", not "Este".
    "e": "Esta es una muestra de la voz seleccionada.",
    "f": "Ceci est un exemple de la voix sélectionnée.",
    "h": "यह चयनित आवाज़ का एक नमूना है।",  # noqa: E501
    "i": "Questo è un esempio della voce selezionata.",
    "j": "これは選択した声のサンプルです。",  # noqa: E501
    "p": "Este é um exemplo da voz selecionada.",
    "z": "这是所选语音的示例。",
}
|