ttsforge 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ttsforge/__init__.py +3 -18
- ttsforge/_version.py +2 -2
- ttsforge/cli/commands_conversion.py +69 -3
- ttsforge/cli/commands_phonemes.py +19 -1
- ttsforge/cli/commands_utility.py +18 -1
- ttsforge/cli/helpers.py +1 -0
- ttsforge/constants.py +11 -2
- ttsforge/conversion.py +103 -42
- ttsforge/kokoro_runner.py +38 -5
- ttsforge/phoneme_conversion.py +52 -1
- {ttsforge-0.1.1.dist-info → ttsforge-0.1.2.dist-info}/METADATA +2 -1
- ttsforge-0.1.2.dist-info/RECORD +27 -0
- ttsforge-0.1.1.dist-info/RECORD +0 -27
- {ttsforge-0.1.1.dist-info → ttsforge-0.1.2.dist-info}/WHEEL +0 -0
- {ttsforge-0.1.1.dist-info → ttsforge-0.1.2.dist-info}/entry_points.txt +0 -0
- {ttsforge-0.1.1.dist-info → ttsforge-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {ttsforge-0.1.1.dist-info → ttsforge-0.1.2.dist-info}/top_level.txt +0 -0
ttsforge/__init__.py
CHANGED
@@ -18,7 +18,7 @@ from pykokoro.tokenizer import (
     Tokenizer,
 )
 from pykokoro.constants import SUPPORTED_LANGUAGES
-
+from pykokoro.onnx_backend import VOICE_NAMES_BY_VARIANT
 from .constants import (
     DEFAULT_CONFIG,
     LANGUAGE_DESCRIPTIONS,
@@ -27,23 +27,7 @@ from .constants import (
 )

 # Import from pykokoro
-
-    from pykokoro.constants import SAMPLE_RATE
-    from pykokoro.onnx_backend import LANG_CODE_TO_ONNX
-except ImportError:
-    # Fallback values if pykokoro not installed
-    SAMPLE_RATE = 24000
-    LANG_CODE_TO_ONNX = {
-        "a": "en-us",
-        "b": "en-gb",
-        "e": "es",
-        "f": "fr-fr",
-        "h": "hi",
-        "i": "it",
-        "j": "ja",
-        "p": "pt",
-        "z": "zh",
-    }
+from pykokoro.constants import SAMPLE_RATE

 from .conversion import (
     Chapter,
@@ -73,6 +57,7 @@ __all__ = [
     "LANGUAGE_DESCRIPTIONS",
     "SUPPORTED_OUTPUT_FORMATS",
     "VOICES",
+    "VOICE_NAMES_BY_VARIANT",
     # Conversion
     "Chapter",
     "ConversionOptions",
ttsforge/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)

 __commit_id__ = commit_id = None
ttsforge/cli/commands_conversion.py
CHANGED
@@ -17,6 +17,7 @@ from typing import Literal, TypedDict, cast

 import click
 import numpy as np
+from pykokoro.onnx_backend import DEFAULT_MODEL_QUALITY, ModelQuality
 from rich.panel import Panel
 from rich.progress import (
     BarColumn,
@@ -37,7 +38,6 @@ from ..constants import (
     LANGUAGE_DESCRIPTIONS,
     SUPPORTED_OUTPUT_FORMATS,
     VOICE_PREFIX_TO_LANG,
-    VOICES,
 )
 from ..conversion import (
     Chapter,
@@ -54,6 +54,7 @@ from ..utils import (
     load_config,
     resolve_conversion_defaults,
 )
+from .commands_utility import _resolve_model_source_and_variant, _resolve_voice_names
 from .helpers import DEFAULT_SAMPLE_TEXT, console, parse_voice_parameter


@@ -64,6 +65,14 @@ class ContentItem(TypedDict):
     page_number: NotRequired[int]


+def get_voices() -> list[str]:
+    """Get the list of available voices."""
+    cfg = load_config()
+
+    model_source, model_variant = _resolve_model_source_and_variant(cfg)
+    return _resolve_voice_names(model_source, model_variant)
+
+
 @click.command()
 @click.argument("epub_file", type=click.Path(exists=True, path_type=Path))
 @click.option(
@@ -82,7 +91,7 @@ class ContentItem(TypedDict):
 @click.option(
     "-v",
     "--voice",
-    type=click.Choice(
+    type=click.Choice(get_voices()),
     help="Voice to use for TTS.",
 )
 @click.option(
@@ -150,6 +159,12 @@ class ContentItem(TypedDict):
     default=None,
     help="Pause mode: 'tts', 'manual', or 'auto' (default: auto).",
 )
+@click.option(
+    "--enable-short-sentence/--disable-short-sentence",
+    "enable_short_sentence",
+    default=None,
+    help="Enable/disable special handling for short sentences.",
+)
 @click.option(
     "--announce-chapters/--no-announce-chapters",
     "announce_chapters",
@@ -296,6 +311,7 @@ def convert(  # noqa: C901
     pause_paragraph: float | None,
     pause_variance: float | None,
     pause_mode: str | None,
+    enable_short_sentence: bool | None,
     announce_chapters: bool | None,
     chapter_pause: float | None,
     title: str | None,
@@ -325,6 +341,10 @@ def convert(  # noqa: C901
     config = load_config()
     model_path = ctx.obj.get("model_path") if ctx.obj else None
     voices_path = ctx.obj.get("voices_path") if ctx.obj else None
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )

     # Get format first (needed for output path construction)
     fmt = output_format or config.get("default_format", "m4b")
@@ -467,6 +487,9 @@ def convert(  # noqa: C901
         language=language or "a",
         speed=speed or config.get("default_speed", 1.0),
         use_gpu=use_gpu if use_gpu is not None else config.get("use_gpu", False),
+        model_source=model_source,
+        model_variant=model_variant,
+        model_quality=model_quality,
         num_chapters=len(selected_indices) if selected_indices else len(epub_chapters),
         title=effective_title,
         author=effective_author,
@@ -510,6 +533,9 @@ def convert(  # noqa: C901
         output_format=output_format or config.get("default_format", "m4b"),
         output_dir=output.parent,
         use_gpu=use_gpu if use_gpu is not None else config.get("use_gpu", False),
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         silence_between_chapters=silence or config.get("silence_between_chapters", 2.0),
         lang=lang or config.get("phonemization_lang"),
         use_mixed_language=(
@@ -556,6 +582,11 @@ def convert(  # noqa: C901
         pause_mode=(
             pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
         ),
+        enable_short_sentence=(
+            enable_short_sentence
+            if enable_short_sentence is not None
+            else config.get("enable_short_sentence", None)
+        ),
         announce_chapters=(
             announce_chapters
             if announce_chapters is not None
@@ -947,6 +978,10 @@ def sample(

     # Load config for defaults
     user_config = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(user_config)
+    model_quality = cast(
+        ModelQuality, user_config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
     resolved_defaults = resolve_conversion_defaults(
         user_config,
         {
@@ -980,6 +1015,9 @@ def sample(
         use_gpu=resolved_defaults["use_gpu"],
         split_mode=resolved_defaults["split_mode"],
         lang=resolved_defaults["lang"],
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         use_mixed_language=(
             use_mixed_language or user_config.get("use_mixed_language", False)
         ),
@@ -1117,6 +1155,9 @@ def _show_conversion_summary(
     language: str,
     speed: float,
     use_gpu: bool,
+    model_source: str,
+    model_variant: str,
+    model_quality: str | None,
     num_chapters: int,
     title: str,
     author: str,
@@ -1139,6 +1180,9 @@ def _show_conversion_summary(
     table.add_row("Chapters", str(num_chapters))
     table.add_row("Voice", voice)
     table.add_row("Language", LANGUAGE_DESCRIPTIONS.get(language, language))
+    table.add_row("Model Source", model_source)
+    table.add_row("Model Variant", model_variant)
+    table.add_row("Model Quality", str(model_quality))
     if lang:
         table.add_row("Phonemization Lang", f"{lang} (override)")
     if use_mixed_language:
@@ -1167,7 +1211,7 @@ def _show_conversion_summary(
 @click.option(
     "-v",
     "--voice",
-    type=click.Choice(
+    type=click.Choice(get_voices()),
     help="TTS voice to use.",
 )
 @click.option(
@@ -1271,6 +1315,11 @@ def _show_conversion_summary(
     default=None,
     help="Trim leading/trailing silence from audio.",
 )
+@click.option(
+    "--enable-short-sentence/--disable-short-sentence",
+    default=None,
+    help="Enable special handling for short sentences.",
+)
 @click.pass_context
 def read(  # noqa: C901
     ctx: click.Context,
@@ -1293,6 +1342,7 @@ def read(  # noqa: C901
     pause_paragraph: float | None,
     pause_variance: float | None,
     pause_mode: str | None,
+    enable_short_sentence: bool | None,
 ) -> None:
     """Read an EPUB or text file aloud with streaming playback.

@@ -1340,6 +1390,10 @@ def read(  # noqa: C901

     # Load config for defaults
     config = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
     resolved_defaults = resolve_conversion_defaults(
         config,
         {
@@ -1389,6 +1443,11 @@ def read(  # noqa: C901
     effective_pause_mode = (
         pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
     )
+    effective_enable_short_sentence = (
+        enable_short_sentence
+        if enable_short_sentence is not None
+        else config.get("enable_short_sentence", None)
+    )

     # Get language code for TTS
     espeak_lang = LANG_CODE_TO_ONNX.get(effective_language, "en-us")
@@ -1645,11 +1704,15 @@ def read(  # noqa: C901
         model_path=model_path,
         voices_path=voices_path,
         use_gpu=effective_use_gpu,
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
     )
     generation = GenerationConfig(
         speed=effective_speed,
         lang=espeak_lang,
         pause_mode=cast(Literal["tts", "manual", "auto"], effective_pause_mode),
+        enable_short_sentence=effective_enable_short_sentence,
         pause_clause=effective_pause_clause,
         pause_sentence=effective_pause_sentence,
         pause_paragraph=effective_pause_paragraph,
@@ -1658,6 +1721,9 @@ def read(  # noqa: C901
     pipeline_config = PipelineConfig(
         voice=effective_voice,
         generation=generation,
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         model_path=model_path,
         voices_path=voices_path,
     )
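
Note: with this release the `--voice` choices for `convert`, `sample`, and `read` are computed at import time from the configured model source and variant instead of the static VOICES list. Below is a minimal, self-contained sketch of that pattern; the voice table and the hard-coded variant are illustrative stand-ins, not ttsforge's actual data or helpers.

import click

# Illustrative stand-in for pykokoro's VOICE_NAMES_BY_VARIANT mapping.
_VOICES_BY_VARIANT = {
    "v1.0": ["af_heart", "af_sky", "bf_emma"],
    "v1.1": ["af_heart", "af_sky", "bf_emma", "df_eva"],
}

def get_voices() -> list[str]:
    """Resolve the selectable voices; ttsforge reads the variant via load_config()."""
    model_variant = "v1.0"  # hypothetical config value for this sketch
    return _VOICES_BY_VARIANT.get(model_variant, _VOICES_BY_VARIANT["v1.0"])

@click.command()
@click.option("-v", "--voice", type=click.Choice(get_voices()), help="Voice to use for TTS.")
def sample(voice: str | None) -> None:
    # click.Choice was populated when the decorator ran, i.e. at import time.
    click.echo(f"voice={voice}")

if __name__ == "__main__":
    sample()

Because the decorator evaluates get_voices() once at import, a change to the configured variant only affects the choice list on the next CLI invocation.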
ttsforge/cli/commands_phonemes.py
CHANGED
@@ -10,9 +10,10 @@ This module contains commands for working with phonemes and pre-tokenized conten
 import re
 import sys
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

 import click
+from pykokoro.onnx_backend import DEFAULT_MODEL_QUALITY, ModelQuality
 from rich.progress import (
     BarColumn,
     Progress,
@@ -37,6 +38,7 @@ from ..utils import (
     format_filename_template,
     load_config,
 )
+from .commands_utility import _resolve_model_source_and_variant
 from .helpers import console, parse_voice_parameter


@@ -500,6 +502,10 @@ def phonemes_convert(
     config = load_config()
     model_path = ctx.obj.get("model_path") if ctx.obj else None
     voices_path = ctx.obj.get("voices_path") if ctx.obj else None
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )

     # Get book info and metadata
     book_info = book.get_info()
@@ -599,6 +605,9 @@ def phonemes_convert(
         speed=speed,
         output_format=fmt,
         use_gpu=gpu,
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         silence_between_chapters=silence,
         pause_clause=(
             pause_clause
@@ -834,6 +843,12 @@ def phonemes_preview(
     # Auto-detect if voice is a blend
     parsed_voice, parsed_voice_blend = parse_voice_parameter(voice)

+    config = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
+
     # Initialize converter
     options = ConversionOptions(
         phoneme_dictionary_path=str(phoneme_dict) if phoneme_dict else None,
@@ -841,6 +856,9 @@ def phonemes_preview(
         voice_blend=parsed_voice_blend,
         language=language,
         output_format="wav",  # Explicitly set WAV format
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
     )
     converter = TTSConverter(options)

ttsforge/cli/commands_utility.py
CHANGED
@@ -555,6 +555,14 @@ def _resolve_model_source_and_variant(cfg: dict) -> tuple[ModelSource, ModelVari
     return cast(ModelSource, source), cast(ModelVariant, variant)


+def _resolve_voice_names(
+    model_source: ModelSource = "huggingface",
+    model_variant: ModelVariant = "v1.0",
+) -> list[str]:
+    """Return the list of voice names for the given model variant."""
+    return VOICE_NAMES_BY_VARIANT.get(model_variant, VOICE_NAMES)
+
+
 def _get_cache_voices_path(
     model_source: ModelSource,
     model_variant: ModelVariant,
@@ -708,7 +716,7 @@ def download(ctx: click.Context, force: bool, quality: str | None) -> None:

     # ---- voices
     if model_source == "huggingface":
-        voice_names =
+        voice_names = _resolve_voice_names(model_source, model_variant)
         total_voices = len(voice_names)
         voices_task = progress.add_task(
             f"Downloading voices (0/{total_voices})...", total=total_voices
@@ -1269,6 +1277,12 @@ def list_names(  # noqa: C901
     )
     console.print("[dim]Type 'q' to quit, 's' to skip, 'r' to replay.[/dim]\n")

+    cfg = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(cfg)
+    model_quality = cast(
+        ModelQuality, cfg.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
+
     # Initialize converter with phoneme dictionary
     try:
         # Auto-detect if voice is a blend
@@ -1279,6 +1293,9 @@ def list_names(  # noqa: C901
             voice=parsed_voice or "af_sky",
             voice_blend=parsed_voice_blend,
             language=language,
+            model_quality=model_quality,
+            model_source=model_source,
+            model_variant=model_variant,
         )
         converter = TTSConverter(options)

ttsforge/cli/helpers.py
CHANGED
@@ -50,6 +50,7 @@ DEFAULT_SAMPLE_TEXT = (
 DEMO_TEXT = {
     "a": "Hello! This audio was generated by {voice}. How do you like it?",
     "b": "Hello! This audio was generated by {voice}. How do you like it?",
+    "d": "Hallo! Dieses Audio wurde von {voice} erzeugt. Wie gefallt es Ihnen?",
     "e": "Hola! Este audio fue generado por {voice}. Que te parece?",
     "f": "Bonjour! Cet audio a ete genere par {voice}. Comment le trouvez-vous?",
     "h": "Namaste! Yah audio {voice} dwara banaya gaya hai. Aapko kaisa laga?",
ttsforge/constants.py
CHANGED
@@ -3,10 +3,10 @@
 # from pykokoro.onnx_backend import VOICE_NAMES_V1_0
 # from pykokoro.onnx_backend import VOICE_NAMES_V1_1_ZH, VOICE_NAMES_V1_1_DE

-from pykokoro.onnx_backend import VOICE_NAMES_V1_0
+from pykokoro.onnx_backend import DEFAULT_MODEL_SOURCE, VOICE_NAMES_V1_0

 # Re-export from pykokoro for convenience
-VOICES =
+VOICES = VOICE_NAMES_V1_0

 # Audio constants from pykokoro
 try:
@@ -24,6 +24,7 @@ PROGRAM_DESCRIPTION = "Generate audiobooks from EPUB files using Kokoro ONNX TTS
 LANGUAGE_DESCRIPTIONS = {
     "a": "American English",
     "b": "British English",
+    "d": "German",
     "e": "Spanish",
     "f": "French",
     "h": "Hindi",
@@ -35,6 +36,8 @@ LANGUAGE_DESCRIPTIONS = {

 # ISO language code to ttsforge language code mapping
 ISO_TO_LANG_CODE = {
+    "de": "d",
+    "de-de": "d",
     "en": "a",  # Default to American English
     "en-us": "a",
     "en-gb": "b",
@@ -62,6 +65,8 @@ VOICE_PREFIX_TO_LANG = {
     "am": "a",  # American Male
     "bf": "b",  # British Female
     "bm": "b",  # British Male
+    "df": "d",  # German Female
+    "dm": "d",  # German Male
     "ef": "e",  # Spanish Female
     "em": "e",  # Spanish Male
     "ff": "f",  # French Female
@@ -82,6 +87,7 @@ VOICE_PREFIX_TO_LANG = {
 DEFAULT_VOICE_FOR_LANG = {
     "a": "af_heart",
     "b": "bf_emma",
+    "d": "df_eva",
     "e": "ef_dora",
     "f": "ff_siwis",
     "h": "hf_alpha",
@@ -115,6 +121,7 @@ DEFAULT_CONFIG = {
     "use_gpu": False,  # GPU requires onnxruntime-gpu
     # Model quality: fp32, fp16, q8, q8f16, q4, q4f16, uint8, uint8f16
     "model_quality": "fp32",
+    "model_source": DEFAULT_MODEL_SOURCE,
     "model_variant": "v1.0",
     "silence_between_chapters": 2.0,
     "save_chapters_separately": False,
@@ -128,6 +135,7 @@ DEFAULT_CONFIG = {
     "pause_paragraph": 0.9,
     "pause_variance": 0.05,
     "pause_mode": "auto",  # "tts", "manual", or "auto
+    "enable_short_sentence": None,
     # Language override for phonemization (e.g., 'de', 'fr', 'en-us')
     # If None, language is determined from voice prefix
     "phonemization_lang": None,
@@ -154,6 +162,7 @@ AUDIO_CHANNELS = 1
 SAMPLE_TEXTS = {
     "a": "This is a sample of the selected voice.",
     "b": "This is a sample of the selected voice.",
+    "d": "Dies ist ein Beispiel für die ausgewählte Stimme.",
     "e": "Este es una muestra de la voz seleccionada.",
     "f": "Ceci est un exemple de la voix sélectionnée.",
     "h": "यह चयनित आवाज़ का एक नमूना है।",  # noqa: E501
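
The constants above wire the new German language code "d" into the existing lookup chain (ISO tag → ttsforge code → default voice). A small self-contained trace of how those tables compose, using only values visible in the diff:

# Subset of the mappings shown above, just enough to trace one lookup.
ISO_TO_LANG_CODE = {"de": "d", "de-de": "d", "en": "a", "en-us": "a", "en-gb": "b"}
LANGUAGE_DESCRIPTIONS = {"a": "American English", "b": "British English", "d": "German"}
DEFAULT_VOICE_FOR_LANG = {"a": "af_heart", "b": "bf_emma", "d": "df_eva"}

def default_voice(iso_code: str) -> tuple[str, str]:
    """Map an ISO language tag to (description, default voice)."""
    lang = ISO_TO_LANG_CODE.get(iso_code.lower(), "a")
    return LANGUAGE_DESCRIPTIONS[lang], DEFAULT_VOICE_FOR_LANG[lang]

print(default_voice("de-DE"))  # ('German', 'df_eva')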
ttsforge/conversion.py
CHANGED
@@ -11,6 +11,14 @@ from pathlib import Path
 from typing import Any, Literal, Optional, cast

 import soundfile as sf
+from pykokoro.onnx_backend import (
+    DEFAULT_MODEL_QUALITY,
+    DEFAULT_MODEL_SOURCE,
+    DEFAULT_MODEL_VARIANT,
+    ModelQuality,
+    ModelSource,
+    ModelVariant,
+)

 from .audio_merge import AudioMerger, MergeMeta
 from .constants import (
@@ -123,12 +131,16 @@ class ConversionState:
     speed: float = 1.0
     split_mode: str = "auto"
     output_format: str = "m4b"
+    model_quality: ModelQuality | None = DEFAULT_MODEL_QUALITY
+    model_source: ModelSource = DEFAULT_MODEL_SOURCE
+    model_variant: ModelVariant = DEFAULT_MODEL_VARIANT
     silence_between_chapters: float = 2.0
     pause_clause: float = 0.3
     pause_sentence: float = 0.5
     pause_paragraph: float = 0.9
     pause_variance: float = 0.05
     pause_mode: str = "auto"  # "tts", "manual", or "auto
+    enable_short_sentence: bool | None = None
     lang: str | None = None  # Language override for phonemization
     chapters: list[ChapterState] = field(default_factory=list)
     started_at: str = ""
@@ -183,8 +195,16 @@ class ConversionState:
             data["pause_variance"] = 0.05
         if "pause_mode" not in data:
             data["pause_mode"] = "auto"
+        if "enable_short_sentence" not in data:
+            data["enable_short_sentence"] = None
         if "lang" not in data:
             data["lang"] = None
+        if "model_quality" not in data:
+            data["model_quality"] = DEFAULT_MODEL_QUALITY
+        if "model_source" not in data:
+            data["model_source"] = DEFAULT_MODEL_SOURCE
+        if "model_variant" not in data:
+            data["model_variant"] = DEFAULT_MODEL_VARIANT

             return cls(**data)
         except (json.JSONDecodeError, TypeError, KeyError):
@@ -204,12 +224,16 @@ class ConversionState:
             "speed": self.speed,
             "split_mode": self.split_mode,
             "output_format": self.output_format,
+            "model_quality": self.model_quality,
+            "model_source": self.model_source,
+            "model_variant": self.model_variant,
             "silence_between_chapters": self.silence_between_chapters,
             "pause_clause": self.pause_clause,
             "pause_sentence": self.pause_sentence,
             "pause_paragraph": self.pause_paragraph,
             "pause_variance": self.pause_variance,
             "pause_mode": self.pause_mode,
+            "enable_short_sentence": self.enable_short_sentence,
             "lang": self.lang,
             "chapters": [
                 {
@@ -294,6 +318,7 @@ class ConversionOptions:
     pause_paragraph: float = 0.9  # For paragraph boundaries
     pause_variance: float = 0.05  # Standard deviation for natural variation
     pause_mode: str = "auto"  # "tts", "manual", or "auto
+    enable_short_sentence: bool | None = None  # Enable short sentence handling
     # Chapter announcement settings
     announce_chapters: bool = True  # Read chapter titles aloud before content
     chapter_pause_after_title: float = 2.0  # Pause after chapter title (seconds)
@@ -315,6 +340,9 @@ class ConversionOptions:
     # Filename template for chapter files
     chapter_filename_template: str = "{chapter_num:03d}_{book_title}_{chapter_title}"
     # Custom ONNX model path (None = use default downloaded model)
+    model_quality: ModelQuality | None = DEFAULT_MODEL_QUALITY
+    model_source: ModelSource = DEFAULT_MODEL_SOURCE
+    model_variant: ModelVariant = DEFAULT_MODEL_VARIANT
     model_path: Path | None = None
     # Custom voices.bin path (None = use default downloaded voices)
     voices_path: Path | None = None
@@ -420,6 +448,9 @@ class TTSConverter:
             pause_sentence=self.options.pause_sentence,
             pause_paragraph=self.options.pause_paragraph,
             pause_variance=self.options.pause_variance,
+            model_quality=self.options.model_quality,
+            model_source=self.options.model_source,
+            model_variant=self.options.model_variant,
             model_path=self.options.model_path,
             voices_path=self.options.voices_path,
             voice_blend=self.options.voice_blend,
@@ -600,53 +631,79 @@ class TTSConverter:
                 )
                 state = None
             else:
-
-
-                state.
-                or state.
-                or state.speed != self.options.speed
-                or state.split_mode != self.options.split_mode
-                or state.silence_between_chapters
-                != self.options.silence_between_chapters
-                or state.pause_clause != self.options.pause_clause
-                or state.pause_sentence != self.options.pause_sentence
-                or state.pause_paragraph != self.options.pause_paragraph
-                or state.pause_variance != self.options.pause_variance
-                or state.pause_mode != self.options.pause_mode
-                or state.lang != self.options.lang
+                model_settings_changed = (
+                    state.model_quality != self.options.model_quality
+                    or state.model_source != self.options.model_source
+                    or state.model_variant != self.options.model_variant
                 )

-                if
+                if model_settings_changed:
                     self.log(
-
-
-
-
-
-
-
-
-
-
-
-
+                        "Model settings changed, starting fresh conversion",
+                        "warning",
+                    )
+                    state = None
+                else:
+                    # Check if settings differ from saved state
+                    settings_changed = (
+                        state.voice != self.options.voice
+                        or state.language != self.options.language
+                        or state.speed != self.options.speed
+                        or state.split_mode != self.options.split_mode
+                        or state.silence_between_chapters
+                        != self.options.silence_between_chapters
+                        or state.pause_clause != self.options.pause_clause
+                        or state.pause_sentence != self.options.pause_sentence
+                        or state.pause_paragraph != self.options.pause_paragraph
+                        or state.pause_variance != self.options.pause_variance
+                        or state.pause_mode != self.options.pause_mode
+                        or state.enable_short_sentence
+                        != self.options.enable_short_sentence
+                        or state.lang != self.options.lang
                 )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if settings_changed:
+                        self.log(
+                            f"Restoring settings from previous session: "
+                            f"voice={state.voice}, language={state.language}, "
+                            f"lang_override={state.lang}, "
+                            f"speed={state.speed}, "
+                            f"split_mode={state.split_mode}, "
+                            f"silence={state.silence_between_chapters}s, "
+                            f"pauses: clause={state.pause_clause}s "
+                            f"sent={state.pause_sentence}s "
+                            f"para={state.pause_paragraph}s "
+                            f"var={state.pause_variance}s "
+                            f"pause_mode={state.pause_mode}, "
+                            f"enable_short_sentence="
+                            f"{state.enable_short_sentence}, "
+                            f"model_source={state.model_source}, "
+                            f"model_variant={state.model_variant}, "
+                            f"model_quality={state.model_quality}",
+                            "info",
+                        )
+
+                        # Apply saved settings to options for consistency
+                        self.options.voice = state.voice
+                        self.options.language = state.language
+                        self.options.speed = state.speed
+                        self.options.split_mode = state.split_mode
+                        self.options.output_format = state.output_format
+                        self.options.silence_between_chapters = (
+                            state.silence_between_chapters
+                        )
+                        self.options.pause_clause = state.pause_clause
+                        self.options.pause_sentence = state.pause_sentence
+                        self.options.pause_paragraph = state.pause_paragraph
+                        self.options.pause_variance = state.pause_variance
+                        self.options.pause_mode = state.pause_mode
+                        self.options.enable_short_sentence = (
+                            state.enable_short_sentence
+                        )
+                        self.options.lang = state.lang
+                        self.options.model_quality = state.model_quality
+                        self.options.model_source = state.model_source
+                        self.options.model_variant = state.model_variant

         if state is None:
             # Create new state
@@ -661,12 +718,16 @@ class TTSConverter:
             speed=self.options.speed,
             split_mode=self.options.split_mode,
             output_format=self.options.output_format,
+            model_quality=self.options.model_quality,
+            model_source=self.options.model_source,
+            model_variant=self.options.model_variant,
             silence_between_chapters=self.options.silence_between_chapters,
             pause_clause=self.options.pause_clause,
             pause_sentence=self.options.pause_sentence,
             pause_paragraph=self.options.pause_paragraph,
             pause_variance=self.options.pause_variance,
             pause_mode=self.options.pause_mode,
+            enable_short_sentence=self.options.enable_short_sentence,
             lang=self.options.lang,
             chapters=[
                 ChapterState(
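
The restructured resume logic above distinguishes two cases: a change to any model setting discards the saved state and starts over, while a change to any other setting keeps the state and copies the saved values back onto the options. A stripped-down sketch of that policy with only a few representative fields (the dataclass here is a simplified stand-in, not ttsforge's ConversionState):

from dataclasses import dataclass

@dataclass
class Settings:
    model_source: str = "huggingface"
    model_variant: str = "v1.0"
    model_quality: str | None = "fp32"
    voice: str = "af_heart"
    pause_mode: str = "auto"

def reconcile(state: Settings | None, options: Settings) -> Settings | None:
    """Return the state to resume from, or None to start a fresh conversion."""
    if state is None:
        return None
    model_settings_changed = (
        state.model_quality != options.model_quality
        or state.model_source != options.model_source
        or state.model_variant != options.model_variant
    )
    if model_settings_changed:
        return None  # resuming with a different model would mix audio characteristics
    # Non-model differences: restore the saved values so the run stays consistent.
    options.voice = state.voice
    options.pause_mode = state.pause_mode
    return state

resumed = reconcile(Settings(voice="bf_emma"), Settings(model_quality="q8"))
print(resumed)  # None: quality changed, so the conversion restarts from scratch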
ttsforge/kokoro_runner.py
CHANGED
@@ -7,11 +7,19 @@ from typing import Any, Literal, Protocol, cast
 import numpy as np
 from pykokoro import GenerationConfig, KokoroPipeline, PipelineConfig
 from pykokoro.onnx_backend import (
+    DEFAULT_MODEL_QUALITY,
+    DEFAULT_MODEL_SOURCE,
+    DEFAULT_MODEL_VARIANT,
     Kokoro,
+    ModelQuality,
+    ModelSource,
+    ModelVariant,
     VoiceBlend,
     are_models_downloaded,
     download_all_models,
+    download_all_models_github,
 )
+from pykokoro.pipeline import build_pipeline
 from pykokoro.stages.audio_generation.onnx import OnnxAudioGenerationAdapter
 from pykokoro.stages.audio_postprocessing.onnx import OnnxAudioPostprocessingAdapter
 from pykokoro.stages.phoneme_processing.onnx import OnnxPhonemeProcessorAdapter
@@ -26,6 +34,9 @@ class KokoroRunOptions:
     pause_sentence: float
     pause_paragraph: float
     pause_variance: float
+    model_quality: ModelQuality | None = DEFAULT_MODEL_QUALITY
+    model_source: ModelSource = DEFAULT_MODEL_SOURCE
+    model_variant: ModelVariant = DEFAULT_MODEL_VARIANT
     model_path: Any | None = None
     voices_path: Any | None = None
     voice_blend: str | None = None
@@ -48,15 +59,32 @@ class KokoroRunner:
         if self._pipeline is not None:
             return

-        if
-        self.
-
+        if self.opts.model_path is None or self.opts.voices_path is None:
+            model_quality = self.opts.model_quality or DEFAULT_MODEL_QUALITY
+            model_source = self.opts.model_source or DEFAULT_MODEL_SOURCE
+            if model_source == "github":
+                if not are_models_downloaded(quality=model_quality):
+                    self.log("Downloading ONNX model files from GitHub...")
+                    download_all_models_github(
+                        variant=self.opts.model_variant,
+                        quality=model_quality,
+                    )
+            else:
+                if not are_models_downloaded(quality=model_quality):
+                    self.log("Downloading ONNX model files...")
+                    download_all_models(
+                        variant=self.opts.model_variant,
+                        quality=model_quality,
+                    )

         self._kokoro = Kokoro(
             model_path=self.opts.model_path,
             voices_path=self.opts.voices_path,
             use_gpu=self.opts.use_gpu,
             tokenizer_config=self.opts.tokenizer_config,
+            model_quality=self.opts.model_quality,
+            model_source=self.opts.model_source,
+            model_variant=self.opts.model_variant,
         )

         assert self._kokoro is not None
@@ -88,14 +116,18 @@ class KokoroRunner:
         pipeline_cfg = PipelineConfig(
             voice=self._voice_style,
             generation=GenerationConfig(speed=self.opts.speed, lang="en-us"),
+            model_quality=self.opts.model_quality,
+            model_source=self.opts.model_source,
+            model_variant=self.opts.model_variant,
             model_path=self.opts.model_path,
             voices_path=self.opts.voices_path,
             tokenizer_config=self.opts.tokenizer_config,
         )

         # Use the same adapters everywhere (text + phonemes)
-        self._pipeline =
-        pipeline_cfg,
+        self._pipeline = build_pipeline(
+            config=pipeline_cfg,
+            backend=self._kokoro,
             phoneme_processing=OnnxPhonemeProcessorAdapter(self._kokoro),
             audio_generation=OnnxAudioGenerationAdapter(self._kokoro),
             audio_postprocessing=OnnxAudioPostprocessingAdapter(self._kokoro),
@@ -116,6 +148,7 @@ class KokoroRunner:
             lang=lang_code,
             is_phonemes=is_phonemes,
             pause_mode=pause_mode,
+            enable_short_sentence=self.opts.enable_short_sentence,
             pause_clause=self.opts.pause_clause,
             pause_sentence=self.opts.pause_sentence,
             pause_paragraph=self.opts.pause_paragraph,
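
The new bootstrap branch in KokoroRunner picks a downloader based on the configured model source before constructing the backend. A self-contained sketch of that dispatch; the downloader functions below are stubs standing in for pykokoro's, and only the call shapes visible in the diff are assumed:

def are_models_downloaded(quality: str) -> bool:
    return False  # stub: pretend nothing is cached yet

def download_all_models(variant: str, quality: str) -> None:
    print(f"downloading {variant}/{quality} from the default source")

def download_all_models_github(variant: str, quality: str) -> None:
    print(f"downloading {variant}/{quality} from GitHub releases")

def ensure_models(model_source: str, model_variant: str, model_quality: str) -> None:
    """Download model files once, choosing the downloader that matches model_source."""
    if are_models_downloaded(quality=model_quality):
        return
    if model_source == "github":
        download_all_models_github(variant=model_variant, quality=model_quality)
    else:
        download_all_models(variant=model_variant, quality=model_quality)

ensure_models("github", "v1.0", "q8")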
ttsforge/phoneme_conversion.py
CHANGED
@@ -15,6 +15,14 @@ from typing import Any, Literal, Optional, cast

 import numpy as np
 import soundfile as sf
+from pykokoro.onnx_backend import (
+    DEFAULT_MODEL_QUALITY,
+    DEFAULT_MODEL_SOURCE,
+    DEFAULT_MODEL_VARIANT,
+    ModelQuality,
+    ModelSource,
+    ModelVariant,
+)

 from .audio_merge import AudioMerger, MergeMeta
 from .chapter_selection import parse_chapter_selection
@@ -94,12 +102,16 @@ class PhonemeConversionState:
     voice: str = ""
     speed: float = 1.0
     output_format: str = "m4b"
+    model_quality: ModelQuality | None = DEFAULT_MODEL_QUALITY
+    model_source: ModelSource = DEFAULT_MODEL_SOURCE
+    model_variant: ModelVariant = DEFAULT_MODEL_VARIANT
     silence_between_chapters: float = 2.0
     pause_clause: float = 0.3
     pause_sentence: float = 0.5
     pause_paragraph: float = 0.9
     pause_variance: float = 0.05
     pause_mode: str = "auto"
+    enable_short_sentence: bool | None = None
     lang: str | None = None  # Language override for phonemization
     chapters: list[PhonemeChapterState] = field(default_factory=list)
     started_at: str = ""
@@ -154,8 +166,16 @@ class PhonemeConversionState:
             data["pause_variance"] = 0.05
         if "pause_mode" not in data:
             data["pause_mode"] = "auto"
+        if "enable_short_sentence" not in data:
+            data["enable_short_sentence"] = None
         if "lang" not in data:
             data["lang"] = None
+        if "model_quality" not in data:
+            data["model_quality"] = DEFAULT_MODEL_QUALITY
+        if "model_source" not in data:
+            data["model_source"] = DEFAULT_MODEL_SOURCE
+        if "model_variant" not in data:
+            data["model_variant"] = DEFAULT_MODEL_VARIANT

             return cls(**data)
         except (json.JSONDecodeError, TypeError, KeyError):
@@ -172,12 +192,16 @@ class PhonemeConversionState:
             "voice": self.voice,
             "speed": self.speed,
             "output_format": self.output_format,
+            "model_quality": self.model_quality,
+            "model_source": self.model_source,
+            "model_variant": self.model_variant,
             "silence_between_chapters": self.silence_between_chapters,
             "pause_clause": self.pause_clause,
             "pause_sentence": self.pause_sentence,
             "pause_paragraph": self.pause_paragraph,
             "pause_variance": self.pause_variance,
             "pause_mode": self.pause_mode,
+            "enable_short_sentence": self.enable_short_sentence,
             "lang": self.lang,
             "chapters": [
                 {
@@ -215,6 +239,7 @@ class PhonemeConversionOptions:
     pause_paragraph: float = 0.9  # For paragraph boundaries
     pause_variance: float = 0.05  # Standard deviation for natural variation
     pause_mode: str = "auto"  # "tts", "manual", or "auto"
+    enable_short_sentence: bool | None = None  # Enable short sentence handling
     # Chapter announcement settings
     announce_chapters: bool = True  # Read chapter titles aloud before content
     chapter_pause_after_title: float = 2.0  # Pause after chapter title (seconds)
@@ -235,6 +260,9 @@ class PhonemeConversionOptions:
     # Filename template for chapter files
     chapter_filename_template: str = "{chapter_num:03d}_{book_title}_{chapter_title}"
     # Custom ONNX model path (None = use default downloaded model)
+    model_quality: ModelQuality | None = DEFAULT_MODEL_QUALITY
+    model_source: ModelSource = DEFAULT_MODEL_SOURCE
+    model_variant: ModelVariant = DEFAULT_MODEL_VARIANT
     model_path: Path | None = None
     # Custom voices.bin path (None = use default downloaded voices)
     voices_path: Path | None = None
@@ -583,6 +611,11 @@ class PhonemeConverter:
                 or state.pause_paragraph != self.options.pause_paragraph
                 or state.pause_variance != self.options.pause_variance
                 or state.pause_mode != self.options.pause_mode
+                or state.enable_short_sentence
+                != self.options.enable_short_sentence
+                or state.model_quality != self.options.model_quality
+                or state.model_source != self.options.model_source
+                or state.model_variant != self.options.model_variant
             ):
                 self.log(
                     f"Restoring settings from previous session: "
@@ -592,7 +625,11 @@ class PhonemeConverter:
                     f"pause_sentence={state.pause_sentence}s, "
                     f"pause_paragraph={state.pause_paragraph}s, "
                     f"pause_variance={state.pause_variance}s, "
-                    f"pause_mode={state.pause_mode}"
+                    f"pause_mode={state.pause_mode}, "
+                    f"enable_short_sentence={state.enable_short_sentence}, "
+                    f"model_source={state.model_source}, "
+                    f"model_variant={state.model_variant}, "
+                    f"model_quality={state.model_quality}",
                     "info",
                 )
                 # Apply saved settings for consistency
@@ -607,6 +644,10 @@ class PhonemeConverter:
                 self.options.pause_paragraph = state.pause_paragraph
                 self.options.pause_variance = state.pause_variance
                 self.options.pause_mode = state.pause_mode
+                self.options.enable_short_sentence = state.enable_short_sentence
+                self.options.model_quality = state.model_quality
+                self.options.model_source = state.model_source
+                self.options.model_variant = state.model_variant

         if state is None:
             # Create new state
@@ -617,12 +658,16 @@ class PhonemeConverter:
                 voice=self.options.voice,
                 speed=self.options.speed,
                 output_format=self.options.output_format,
+                model_quality=self.options.model_quality,
+                model_source=self.options.model_source,
+                model_variant=self.options.model_variant,
                 silence_between_chapters=self.options.silence_between_chapters,
                 pause_clause=self.options.pause_clause,
                 pause_sentence=self.options.pause_sentence,
                 pause_paragraph=self.options.pause_paragraph,
                 pause_variance=self.options.pause_variance,
                 pause_mode=self.options.pause_mode,
+                enable_short_sentence=self.options.enable_short_sentence,
                 chapters=[
                     PhonemeChapterState(
                         index=idx,
@@ -648,6 +693,9 @@ class PhonemeConverter:
             pause_sentence=self.options.pause_sentence,
             pause_paragraph=self.options.pause_paragraph,
             pause_variance=self.options.pause_variance,
+            model_quality=self.options.model_quality,
+            model_source=self.options.model_source,
+            model_variant=self.options.model_variant,
             model_path=self.options.model_path,
             voices_path=self.options.voices_path,
             voice_blend=self.options.voice_blend,
@@ -848,6 +896,9 @@ class PhonemeConverter:
             pause_sentence=self.options.pause_sentence,
             pause_paragraph=self.options.pause_paragraph,
             pause_variance=self.options.pause_variance,
+            model_quality=self.options.model_quality,
+            model_source=self.options.model_source,
+            model_variant=self.options.model_variant,
             model_path=self.options.model_path,
             voices_path=self.options.voices_path,
             voice_blend=self.options.voice_blend,
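
Both converters also backfill the new keys when loading a state file written by an older ttsforge, so resuming a 0.1.1 run does not fail on missing fields. The pattern, reduced to a standalone snippet; the default values mirror the ones shown in the hunks above, and the loader name is illustrative:

import json

_NEW_STATE_DEFAULTS = {
    "enable_short_sentence": None,
    "model_quality": "fp32",        # DEFAULT_MODEL_QUALITY
    "model_source": "huggingface",  # DEFAULT_MODEL_SOURCE
    "model_variant": "v1.0",        # DEFAULT_MODEL_VARIANT
}

def load_state(raw: str) -> dict:
    """Parse saved conversion state and fill in keys older versions never wrote."""
    data = json.loads(raw)
    for key, default in _NEW_STATE_DEFAULTS.items():
        data.setdefault(key, default)
    return data

old_state = '{"voice": "af_heart", "pause_mode": "auto"}'
print(load_state(old_state)["model_variant"])  # v1.0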
{ttsforge-0.1.1.dist-info → ttsforge-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ttsforge
-Version: 0.1.1
+Version: 0.1.2
 Summary: Generate audiobooks from EPUB files using Kokoro ONNX TTS.
 Author-email: Holger Nahrstaedt <nahrstaedt@gmail.com>
 License: MIT License
@@ -617,6 +617,7 @@ ttsforge convert book.epub --gpu
 | `pause_paragraph` | `0.9` | Paragraph pause (seconds) |
 | `pause_variance` | `0.05` | Pause variance (seconds) |
 | `pause_mode` | `auto` | Pause mode (`tts`, `manual`, `auto`) |
+| `enable_short_sentence` | `None` | Handle short sentences |
 | `announce_chapters` | `true` | Speak chapter titles |
 | `chapter_pause_after_title` | `2.0` | Pause after chapter title |
 | `phonemization_lang` | `None` | Override phonemization language |
ttsforge-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,27 @@
+ttsforge/__init__.py,sha256=5mT7eXwuq0Z-Qn_WjYWVjA8VUOUy0lGTyaqQwNdcEOE,2149
+ttsforge/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
+ttsforge/audio_merge.py,sha256=Tt7o8GBNrkcfiSKycUpWvblj-y4zwlULoX-eCblqYpo,5666
+ttsforge/audio_player.py,sha256=HYc4vv46yDXjVXaWRlj1tUtWLiwNTwbzT6oDfOUB5vA,14351
+ttsforge/chapter_selection.py,sha256=a-XlEO4HMzeUBfhvnh6gQOQmDuM0wMpVCi0pw6oM2hQ,2579
+ttsforge/constants.py,sha256=c3b9s41mNIb8NHK6C7XKB-xzX4wJhFZpo0auUqCwT2s,5394
+ttsforge/conversion.py,sha256=2ZzeBfg-Kwf7kR9Fwht30Vx9ZcoAEQOVAiFsmfF2IdA,45011
+ttsforge/input_reader.py,sha256=b49SBT-mL4SnR74D8xwyWHC_smPhsJ5jpPAj4QQ5WKo,14068
+ttsforge/kokoro_lang.py,sha256=8603b5whfk0KzGrNK7pqRjzoH1Ge9TKoX7AMzKsX0sk,376
+ttsforge/kokoro_runner.py,sha256=AKvEMaBfCTCLR3KcHoE04nQnUkHDwv62BvQNb1vGC8U,5923
+ttsforge/name_extractor.py,sha256=CxxBadCO0Pcoepcj7gZwkfWPMud2oa9477h_lDYWrIA,9578
+ttsforge/phoneme_conversion.py,sha256=nbDV0adWi--XyRt2RblJbav_ImzDeXrC0xPvoPx8_9c,41093
+ttsforge/phonemes.py,sha256=EUZ1Qr-0rPThRpSeuJQe5Z3J3nz7rX1Xs3Rjjw19qIQ,15517
+ttsforge/ssmd_generator.py,sha256=LknVBSETKH9cY4CoAUBjd1vEfr5VXi0xqEKcft2CR8I,13346
+ttsforge/utils.py,sha256=3BiNFyScV3Dy_xhVm2EigpxUb4Z6YwIQPzzxwDzfCzI,24942
+ttsforge/cli/__init__.py,sha256=CTqYeUAJaKV7YTYqcmr7-VxjwJfjLcnPYM2OKyws0Oc,2103
+ttsforge/cli/commands_conversion.py,sha256=T4hPiU4EXDQ2Wkbd5I5TuHvuAT8rlfEjhfC4mlSMrzg,66007
+ttsforge/cli/commands_phonemes.py,sha256=k3CtXKnUTpbGHj1oPucjIB6syA3LWjxFpg3OIC-tzJ8,33183
+ttsforge/cli/commands_utility.py,sha256=65NSHUFYjRPOWVLnpeBKbR-TbsaDLYAa5xDGcWHS-fk,48630
+ttsforge/cli/helpers.py,sha256=IJt0VpIMPOC-lnBeR3-1keh31MuAdSemDsLh6FpiHLk,2778
+ttsforge/vocab/__init__.py,sha256=lMgS0dY9VbOYI20LnPjjqrWcjLIQ1FKkR4-xcXsvrqc,3641
+ttsforge-0.1.2.dist-info/licenses/LICENSE,sha256=9csb1sDNn0HdUPKgOTUwtb4CkvYPcFXHnkxKCS99EWQ,1074
+ttsforge-0.1.2.dist-info/METADATA,sha256=T0xJ8RtNsBidb4sa_JYR-QnugMK-fLLyVg71R-zi0QU,19655
+ttsforge-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ttsforge-0.1.2.dist-info/entry_points.txt,sha256=SrcNdlhQpoUCzPzhVbOmMzATQeV7j7XYl0DPrVjZ-ks,47
+ttsforge-0.1.2.dist-info/top_level.txt,sha256=rNLi-3muicHF8UvZu_FuA2ML_Dz9sVPCjik2E8XnCVk,9
+ttsforge-0.1.2.dist-info/RECORD,,
ttsforge-0.1.1.dist-info/RECORD
DELETED
@@ -1,27 +0,0 @@
-ttsforge/__init__.py,sha256=Jg8_0vPttTVWrnt4HBqrTOKYfmcgpVpfddSVcU4HKXo,2432
-ttsforge/_version.py,sha256=m8HxkqoKGw_wAJtc4ZokpJKNLXqp4zwnNhbnfDtro7w,704
-ttsforge/audio_merge.py,sha256=Tt7o8GBNrkcfiSKycUpWvblj-y4zwlULoX-eCblqYpo,5666
-ttsforge/audio_player.py,sha256=HYc4vv46yDXjVXaWRlj1tUtWLiwNTwbzT6oDfOUB5vA,14351
-ttsforge/chapter_selection.py,sha256=a-XlEO4HMzeUBfhvnh6gQOQmDuM0wMpVCi0pw6oM2hQ,2579
-ttsforge/constants.py,sha256=Y_u8BanHjZBGiAnc9FVPulVHHGIRYlOJhq74BTtxPYA,5108
-ttsforge/conversion.py,sha256=gz6ePEHjC7Nu1zlPArdsIOZA2rb_EEfJMaV6lA-0V2o,41623
-ttsforge/input_reader.py,sha256=b49SBT-mL4SnR74D8xwyWHC_smPhsJ5jpPAj4QQ5WKo,14068
-ttsforge/kokoro_lang.py,sha256=8603b5whfk0KzGrNK7pqRjzoH1Ge9TKoX7AMzKsX0sk,376
-ttsforge/kokoro_runner.py,sha256=ZGBx70_rHcfwKiUgywa_3-7d5u-wQ_0pPOukQRuACu0,4390
-ttsforge/name_extractor.py,sha256=CxxBadCO0Pcoepcj7gZwkfWPMud2oa9477h_lDYWrIA,9578
-ttsforge/phoneme_conversion.py,sha256=nLnFk1uuMMD1Mw4a24zZeZHDHG_4mG3kW1xiOcKM9YA,38274
-ttsforge/phonemes.py,sha256=EUZ1Qr-0rPThRpSeuJQe5Z3J3nz7rX1Xs3Rjjw19qIQ,15517
-ttsforge/ssmd_generator.py,sha256=LknVBSETKH9cY4CoAUBjd1vEfr5VXi0xqEKcft2CR8I,13346
-ttsforge/utils.py,sha256=3BiNFyScV3Dy_xhVm2EigpxUb4Z6YwIQPzzxwDzfCzI,24942
-ttsforge/cli/__init__.py,sha256=CTqYeUAJaKV7YTYqcmr7-VxjwJfjLcnPYM2OKyws0Oc,2103
-ttsforge/cli/commands_conversion.py,sha256=fvbC62gswEb-qpVoU_9S81nFOApH09v9g29nG0HE9X4,63421
-ttsforge/cli/commands_phonemes.py,sha256=Y1_t4jLDuEFVLY6sQC9uwcU9ptt4Y7FrRBRWtkSAF0o,32371
-ttsforge/cli/commands_utility.py,sha256=_8KMUjVYVqp63PH_gjOjS-fw6ZCujaMDXFxoKfUzlko,48013
-ttsforge/cli/helpers.py,sha256=5Co2EvDhYspKhjW2-P3sNxj9MFFgWyTFeqOyJbPy2yA,2697
-ttsforge/vocab/__init__.py,sha256=lMgS0dY9VbOYI20LnPjjqrWcjLIQ1FKkR4-xcXsvrqc,3641
-ttsforge-0.1.1.dist-info/licenses/LICENSE,sha256=9csb1sDNn0HdUPKgOTUwtb4CkvYPcFXHnkxKCS99EWQ,1074
-ttsforge-0.1.1.dist-info/METADATA,sha256=tVUrSm-pBcKWQ0gyrsazqadW0ufARWPXp00qHJsf6ag,19567
-ttsforge-0.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-ttsforge-0.1.1.dist-info/entry_points.txt,sha256=SrcNdlhQpoUCzPzhVbOmMzATQeV7j7XYl0DPrVjZ-ks,47
-ttsforge-0.1.1.dist-info/top_level.txt,sha256=rNLi-3muicHF8UvZu_FuA2ML_Dz9sVPCjik2E8XnCVk,9
-ttsforge-0.1.1.dist-info/RECORD,,
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|