ttsforge 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.github/workflows/codecov.yml +1 -1
- {ttsforge-0.1.1 → ttsforge-0.1.2}/PKG-INFO +2 -1
- {ttsforge-0.1.1 → ttsforge-0.1.2}/README.md +1 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/cli.rst +3 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/configuration.rst +1 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_constants.py +1 -6
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/__init__.py +3 -18
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/_version.py +3 -3
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/commands_conversion.py +69 -3
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/commands_phonemes.py +19 -1
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/commands_utility.py +18 -1
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/helpers.py +1 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/constants.py +11 -2
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/conversion.py +103 -42
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/kokoro_runner.py +38 -5
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/phoneme_conversion.py +52 -1
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge.egg-info/PKG-INFO +2 -1
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.codecrate.toml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.coveragerc +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.github/pytest.ini +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.github/workflows/pre-commit.yml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.github/workflows/python-publish.yml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.github/workflows/tests.yml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.gitignore +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.pre-commit-config.yaml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.prettierrc.yml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.readthedocs.yaml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/.ruff.toml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/LICENSE +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/api/index.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/conf.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/filename_templates.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/index.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/installation.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/make.bat +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/make.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/quickstart.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/requirements.txt +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/ssmd.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/docs/voices.rst +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/examples/__init__.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/examples/phoneme_export.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/pyproject.toml +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/requirements-test.txt +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/setup.cfg +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/setup.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/__init__.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_chapter_announcement.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_chapter_marker_leading_space.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_chapter_selection.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_cli.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_cli_smoke.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_conversion.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_conversion_state.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_epub_chapter_markers.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_name_extractor.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_onnx_backend.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_phoneme_conversion.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_phoneme_dictionary.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_phonemes.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_ssmd_generator.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_tokenizer.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_utils.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/audio_merge.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/audio_player.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/chapter_selection.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/__init__.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/input_reader.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/kokoro_lang.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/name_extractor.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/phonemes.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/ssmd_generator.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/utils.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/vocab/__init__.py +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge.egg-info/SOURCES.txt +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge.egg-info/dependency_links.txt +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge.egg-info/entry_points.txt +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge.egg-info/requires.txt +0 -0
- {ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ttsforge
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Generate audiobooks from EPUB files using Kokoro ONNX TTS.
|
|
5
5
|
Author-email: Holger Nahrstaedt <nahrstaedt@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -617,6 +617,7 @@ ttsforge convert book.epub --gpu
|
|
|
617
617
|
| `pause_paragraph` | `0.9` | Paragraph pause (seconds) |
|
|
618
618
|
| `pause_variance` | `0.05` | Pause variance (seconds) |
|
|
619
619
|
| `pause_mode` | `auto` | Pause mode (`tts`, `manual`, `auto`) |
|
|
620
|
+
| `enable_short_sentence` | `None` | Handle short sentences |
|
|
620
621
|
| `announce_chapters` | `true` | Speak chapter titles |
|
|
621
622
|
| `chapter_pause_after_title` | `2.0` | Pause after chapter title |
|
|
622
623
|
| `phonemization_lang` | `None` | Override phonemization language |
|
|
@@ -554,6 +554,7 @@ ttsforge convert book.epub --gpu
|
|
|
554
554
|
| `pause_paragraph` | `0.9` | Paragraph pause (seconds) |
|
|
555
555
|
| `pause_variance` | `0.05` | Pause variance (seconds) |
|
|
556
556
|
| `pause_mode` | `auto` | Pause mode (`tts`, `manual`, `auto`) |
|
|
557
|
+
| `enable_short_sentence` | `None` | Handle short sentences |
|
|
557
558
|
| `announce_chapters` | `true` | Speak chapter titles |
|
|
558
559
|
| `chapter_pause_after_title` | `2.0` | Pause after chapter title |
|
|
559
560
|
| `phonemization_lang` | `None` | Override phonemization language |
|
|
@@ -85,6 +85,9 @@ Options
|
|
|
85
85
|
``--pause-mode MODE``
|
|
86
86
|
Pause mode: ``tts``, ``manual``, or ``auto``. Default: ``auto``.
|
|
87
87
|
|
|
88
|
+
``--enable-short-sentence``
|
|
89
|
+
Enable special handling for short sentences (less than 5 words).
|
|
90
|
+
|
|
88
91
|
``--announce-chapters / --no-announce-chapters``
|
|
89
92
|
Read chapter titles aloud before chapter content. Default: enabled.
|
|
90
93
|
|
|
@@ -452,6 +452,7 @@ Here's an example ``config.json`` with custom settings:
|
|
|
452
452
|
"pause_paragraph": 0.9,
|
|
453
453
|
"pause_variance": 0.05,
|
|
454
454
|
"pause_mode": "auto",
|
|
455
|
+
"enable_short_sentence": None,
|
|
455
456
|
"announce_chapters": true,
|
|
456
457
|
"chapter_pause_after_title": 2.0,
|
|
457
458
|
"save_chapters_separately": false,
|
|
@@ -20,7 +20,7 @@ class TestLanguageDescriptions:
|
|
|
20
20
|
|
|
21
21
|
def test_all_language_codes_have_descriptions(self):
|
|
22
22
|
"""All language codes should have descriptions."""
|
|
23
|
-
expected_codes = {"a", "b", "e", "f", "h", "i", "j", "p", "z"}
|
|
23
|
+
expected_codes = {"a", "b", "d", "e", "f", "h", "i", "j", "p", "z"}
|
|
24
24
|
assert set(LANGUAGE_DESCRIPTIONS.keys()) == expected_codes
|
|
25
25
|
|
|
26
26
|
def test_english_variants(self):
|
|
@@ -132,11 +132,6 @@ class TestDefaultVoiceForLang:
|
|
|
132
132
|
lang in DEFAULT_VOICE_FOR_LANG
|
|
133
133
|
), f"Language {lang} needs default voice"
|
|
134
134
|
|
|
135
|
-
def test_default_voices_exist_in_voices_list(self):
|
|
136
|
-
"""All default voices should exist in VOICES list."""
|
|
137
|
-
for lang, voice in DEFAULT_VOICE_FOR_LANG.items():
|
|
138
|
-
assert voice in VOICES, f"Default voice {voice} for {lang} not in VOICES"
|
|
139
|
-
|
|
140
135
|
def test_default_voices_match_language(self):
|
|
141
136
|
"""Default voices should match their language."""
|
|
142
137
|
for lang, voice in DEFAULT_VOICE_FOR_LANG.items():
|
|
@@ -18,7 +18,7 @@ from pykokoro.tokenizer import (
|
|
|
18
18
|
Tokenizer,
|
|
19
19
|
)
|
|
20
20
|
from pykokoro.constants import SUPPORTED_LANGUAGES
|
|
21
|
-
|
|
21
|
+
from pykokoro.onnx_backend import VOICE_NAMES_BY_VARIANT
|
|
22
22
|
from .constants import (
|
|
23
23
|
DEFAULT_CONFIG,
|
|
24
24
|
LANGUAGE_DESCRIPTIONS,
|
|
@@ -27,23 +27,7 @@ from .constants import (
|
|
|
27
27
|
)
|
|
28
28
|
|
|
29
29
|
# Import from pykokoro
|
|
30
|
-
|
|
31
|
-
from pykokoro.constants import SAMPLE_RATE
|
|
32
|
-
from pykokoro.onnx_backend import LANG_CODE_TO_ONNX
|
|
33
|
-
except ImportError:
|
|
34
|
-
# Fallback values if pykokoro not installed
|
|
35
|
-
SAMPLE_RATE = 24000
|
|
36
|
-
LANG_CODE_TO_ONNX = {
|
|
37
|
-
"a": "en-us",
|
|
38
|
-
"b": "en-gb",
|
|
39
|
-
"e": "es",
|
|
40
|
-
"f": "fr-fr",
|
|
41
|
-
"h": "hi",
|
|
42
|
-
"i": "it",
|
|
43
|
-
"j": "ja",
|
|
44
|
-
"p": "pt",
|
|
45
|
-
"z": "zh",
|
|
46
|
-
}
|
|
30
|
+
from pykokoro.constants import SAMPLE_RATE
|
|
47
31
|
|
|
48
32
|
from .conversion import (
|
|
49
33
|
Chapter,
|
|
@@ -73,6 +57,7 @@ __all__ = [
|
|
|
73
57
|
"LANGUAGE_DESCRIPTIONS",
|
|
74
58
|
"SUPPORTED_OUTPUT_FORMATS",
|
|
75
59
|
"VOICES",
|
|
60
|
+
"VOICE_NAMES_BY_VARIANT",
|
|
76
61
|
# Conversion
|
|
77
62
|
"Chapter",
|
|
78
63
|
"ConversionOptions",
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.1'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1, 1)
|
|
31
|
+
__version__ = version = '0.1.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 2)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'gb31ed0898'
|
|
@@ -17,6 +17,7 @@ from typing import Literal, TypedDict, cast
|
|
|
17
17
|
|
|
18
18
|
import click
|
|
19
19
|
import numpy as np
|
|
20
|
+
from pykokoro.onnx_backend import DEFAULT_MODEL_QUALITY, ModelQuality
|
|
20
21
|
from rich.panel import Panel
|
|
21
22
|
from rich.progress import (
|
|
22
23
|
BarColumn,
|
|
@@ -37,7 +38,6 @@ from ..constants import (
|
|
|
37
38
|
LANGUAGE_DESCRIPTIONS,
|
|
38
39
|
SUPPORTED_OUTPUT_FORMATS,
|
|
39
40
|
VOICE_PREFIX_TO_LANG,
|
|
40
|
-
VOICES,
|
|
41
41
|
)
|
|
42
42
|
from ..conversion import (
|
|
43
43
|
Chapter,
|
|
@@ -54,6 +54,7 @@ from ..utils import (
|
|
|
54
54
|
load_config,
|
|
55
55
|
resolve_conversion_defaults,
|
|
56
56
|
)
|
|
57
|
+
from .commands_utility import _resolve_model_source_and_variant, _resolve_voice_names
|
|
57
58
|
from .helpers import DEFAULT_SAMPLE_TEXT, console, parse_voice_parameter
|
|
58
59
|
|
|
59
60
|
|
|
@@ -64,6 +65,14 @@ class ContentItem(TypedDict):
|
|
|
64
65
|
page_number: NotRequired[int]
|
|
65
66
|
|
|
66
67
|
|
|
68
|
+
def get_voices() -> list[str]:
|
|
69
|
+
"""Get the list of available voices."""
|
|
70
|
+
cfg = load_config()
|
|
71
|
+
|
|
72
|
+
model_source, model_variant = _resolve_model_source_and_variant(cfg)
|
|
73
|
+
return _resolve_voice_names(model_source, model_variant)
|
|
74
|
+
|
|
75
|
+
|
|
67
76
|
@click.command()
|
|
68
77
|
@click.argument("epub_file", type=click.Path(exists=True, path_type=Path))
|
|
69
78
|
@click.option(
|
|
@@ -82,7 +91,7 @@ class ContentItem(TypedDict):
|
|
|
82
91
|
@click.option(
|
|
83
92
|
"-v",
|
|
84
93
|
"--voice",
|
|
85
|
-
type=click.Choice(VOICES),
|
|
94
|
+
type=click.Choice(get_voices()),
|
|
86
95
|
help="Voice to use for TTS.",
|
|
87
96
|
)
|
|
88
97
|
@click.option(
|
|
@@ -150,6 +159,12 @@ class ContentItem(TypedDict):
|
|
|
150
159
|
default=None,
|
|
151
160
|
help="Pause mode: 'tts', 'manual', or 'auto' (default: auto).",
|
|
152
161
|
)
|
|
162
|
+
@click.option(
|
|
163
|
+
"--enable-short-sentence/--disable-short-sentence",
|
|
164
|
+
"enable_short_sentence",
|
|
165
|
+
default=None,
|
|
166
|
+
help="Enable/disable special handling for short sentences.",
|
|
167
|
+
)
|
|
153
168
|
@click.option(
|
|
154
169
|
"--announce-chapters/--no-announce-chapters",
|
|
155
170
|
"announce_chapters",
|
|
@@ -296,6 +311,7 @@ def convert( # noqa: C901
|
|
|
296
311
|
pause_paragraph: float | None,
|
|
297
312
|
pause_variance: float | None,
|
|
298
313
|
pause_mode: str | None,
|
|
314
|
+
enable_short_sentence: bool | None,
|
|
299
315
|
announce_chapters: bool | None,
|
|
300
316
|
chapter_pause: float | None,
|
|
301
317
|
title: str | None,
|
|
@@ -325,6 +341,10 @@ def convert( # noqa: C901
|
|
|
325
341
|
config = load_config()
|
|
326
342
|
model_path = ctx.obj.get("model_path") if ctx.obj else None
|
|
327
343
|
voices_path = ctx.obj.get("voices_path") if ctx.obj else None
|
|
344
|
+
model_source, model_variant = _resolve_model_source_and_variant(config)
|
|
345
|
+
model_quality = cast(
|
|
346
|
+
ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
|
|
347
|
+
)
|
|
328
348
|
|
|
329
349
|
# Get format first (needed for output path construction)
|
|
330
350
|
fmt = output_format or config.get("default_format", "m4b")
|
|
@@ -467,6 +487,9 @@ def convert( # noqa: C901
|
|
|
467
487
|
language=language or "a",
|
|
468
488
|
speed=speed or config.get("default_speed", 1.0),
|
|
469
489
|
use_gpu=use_gpu if use_gpu is not None else config.get("use_gpu", False),
|
|
490
|
+
model_source=model_source,
|
|
491
|
+
model_variant=model_variant,
|
|
492
|
+
model_quality=model_quality,
|
|
470
493
|
num_chapters=len(selected_indices) if selected_indices else len(epub_chapters),
|
|
471
494
|
title=effective_title,
|
|
472
495
|
author=effective_author,
|
|
@@ -510,6 +533,9 @@ def convert( # noqa: C901
|
|
|
510
533
|
output_format=output_format or config.get("default_format", "m4b"),
|
|
511
534
|
output_dir=output.parent,
|
|
512
535
|
use_gpu=use_gpu if use_gpu is not None else config.get("use_gpu", False),
|
|
536
|
+
model_quality=model_quality,
|
|
537
|
+
model_source=model_source,
|
|
538
|
+
model_variant=model_variant,
|
|
513
539
|
silence_between_chapters=silence or config.get("silence_between_chapters", 2.0),
|
|
514
540
|
lang=lang or config.get("phonemization_lang"),
|
|
515
541
|
use_mixed_language=(
|
|
@@ -556,6 +582,11 @@ def convert( # noqa: C901
|
|
|
556
582
|
pause_mode=(
|
|
557
583
|
pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
|
|
558
584
|
),
|
|
585
|
+
enable_short_sentence=(
|
|
586
|
+
enable_short_sentence
|
|
587
|
+
if enable_short_sentence is not None
|
|
588
|
+
else config.get("enable_short_sentence", None)
|
|
589
|
+
),
|
|
559
590
|
announce_chapters=(
|
|
560
591
|
announce_chapters
|
|
561
592
|
if announce_chapters is not None
|
|
@@ -947,6 +978,10 @@ def sample(
|
|
|
947
978
|
|
|
948
979
|
# Load config for defaults
|
|
949
980
|
user_config = load_config()
|
|
981
|
+
model_source, model_variant = _resolve_model_source_and_variant(user_config)
|
|
982
|
+
model_quality = cast(
|
|
983
|
+
ModelQuality, user_config.get("model_quality", DEFAULT_MODEL_QUALITY)
|
|
984
|
+
)
|
|
950
985
|
resolved_defaults = resolve_conversion_defaults(
|
|
951
986
|
user_config,
|
|
952
987
|
{
|
|
@@ -980,6 +1015,9 @@ def sample(
|
|
|
980
1015
|
use_gpu=resolved_defaults["use_gpu"],
|
|
981
1016
|
split_mode=resolved_defaults["split_mode"],
|
|
982
1017
|
lang=resolved_defaults["lang"],
|
|
1018
|
+
model_quality=model_quality,
|
|
1019
|
+
model_source=model_source,
|
|
1020
|
+
model_variant=model_variant,
|
|
983
1021
|
use_mixed_language=(
|
|
984
1022
|
use_mixed_language or user_config.get("use_mixed_language", False)
|
|
985
1023
|
),
|
|
@@ -1117,6 +1155,9 @@ def _show_conversion_summary(
|
|
|
1117
1155
|
language: str,
|
|
1118
1156
|
speed: float,
|
|
1119
1157
|
use_gpu: bool,
|
|
1158
|
+
model_source: str,
|
|
1159
|
+
model_variant: str,
|
|
1160
|
+
model_quality: str | None,
|
|
1120
1161
|
num_chapters: int,
|
|
1121
1162
|
title: str,
|
|
1122
1163
|
author: str,
|
|
@@ -1139,6 +1180,9 @@ def _show_conversion_summary(
|
|
|
1139
1180
|
table.add_row("Chapters", str(num_chapters))
|
|
1140
1181
|
table.add_row("Voice", voice)
|
|
1141
1182
|
table.add_row("Language", LANGUAGE_DESCRIPTIONS.get(language, language))
|
|
1183
|
+
table.add_row("Model Source", model_source)
|
|
1184
|
+
table.add_row("Model Variant", model_variant)
|
|
1185
|
+
table.add_row("Model Quality", str(model_quality))
|
|
1142
1186
|
if lang:
|
|
1143
1187
|
table.add_row("Phonemization Lang", f"{lang} (override)")
|
|
1144
1188
|
if use_mixed_language:
|
|
@@ -1167,7 +1211,7 @@ def _show_conversion_summary(
|
|
|
1167
1211
|
@click.option(
|
|
1168
1212
|
"-v",
|
|
1169
1213
|
"--voice",
|
|
1170
|
-
type=click.Choice(VOICES),
|
|
1214
|
+
type=click.Choice(get_voices()),
|
|
1171
1215
|
help="TTS voice to use.",
|
|
1172
1216
|
)
|
|
1173
1217
|
@click.option(
|
|
@@ -1271,6 +1315,11 @@ def _show_conversion_summary(
|
|
|
1271
1315
|
default=None,
|
|
1272
1316
|
help="Trim leading/trailing silence from audio.",
|
|
1273
1317
|
)
|
|
1318
|
+
@click.option(
|
|
1319
|
+
"--enable-short-sentence/--disable-short-sentence",
|
|
1320
|
+
default=None,
|
|
1321
|
+
help="Enable special handling for short sentences.",
|
|
1322
|
+
)
|
|
1274
1323
|
@click.pass_context
|
|
1275
1324
|
def read( # noqa: C901
|
|
1276
1325
|
ctx: click.Context,
|
|
@@ -1293,6 +1342,7 @@ def read( # noqa: C901
|
|
|
1293
1342
|
pause_paragraph: float | None,
|
|
1294
1343
|
pause_variance: float | None,
|
|
1295
1344
|
pause_mode: str | None,
|
|
1345
|
+
enable_short_sentence: bool | None,
|
|
1296
1346
|
) -> None:
|
|
1297
1347
|
"""Read an EPUB or text file aloud with streaming playback.
|
|
1298
1348
|
|
|
@@ -1340,6 +1390,10 @@ def read( # noqa: C901
|
|
|
1340
1390
|
|
|
1341
1391
|
# Load config for defaults
|
|
1342
1392
|
config = load_config()
|
|
1393
|
+
model_source, model_variant = _resolve_model_source_and_variant(config)
|
|
1394
|
+
model_quality = cast(
|
|
1395
|
+
ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
|
|
1396
|
+
)
|
|
1343
1397
|
resolved_defaults = resolve_conversion_defaults(
|
|
1344
1398
|
config,
|
|
1345
1399
|
{
|
|
@@ -1389,6 +1443,11 @@ def read( # noqa: C901
|
|
|
1389
1443
|
effective_pause_mode = (
|
|
1390
1444
|
pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
|
|
1391
1445
|
)
|
|
1446
|
+
effective_enable_short_sentence = (
|
|
1447
|
+
enable_short_sentence
|
|
1448
|
+
if enable_short_sentence is not None
|
|
1449
|
+
else config.get("enable_short_sentence", None)
|
|
1450
|
+
)
|
|
1392
1451
|
|
|
1393
1452
|
# Get language code for TTS
|
|
1394
1453
|
espeak_lang = LANG_CODE_TO_ONNX.get(effective_language, "en-us")
|
|
@@ -1645,11 +1704,15 @@ def read( # noqa: C901
|
|
|
1645
1704
|
model_path=model_path,
|
|
1646
1705
|
voices_path=voices_path,
|
|
1647
1706
|
use_gpu=effective_use_gpu,
|
|
1707
|
+
model_quality=model_quality,
|
|
1708
|
+
model_source=model_source,
|
|
1709
|
+
model_variant=model_variant,
|
|
1648
1710
|
)
|
|
1649
1711
|
generation = GenerationConfig(
|
|
1650
1712
|
speed=effective_speed,
|
|
1651
1713
|
lang=espeak_lang,
|
|
1652
1714
|
pause_mode=cast(Literal["tts", "manual", "auto"], effective_pause_mode),
|
|
1715
|
+
enable_short_sentence=effective_enable_short_sentence,
|
|
1653
1716
|
pause_clause=effective_pause_clause,
|
|
1654
1717
|
pause_sentence=effective_pause_sentence,
|
|
1655
1718
|
pause_paragraph=effective_pause_paragraph,
|
|
@@ -1658,6 +1721,9 @@ def read( # noqa: C901
|
|
|
1658
1721
|
pipeline_config = PipelineConfig(
|
|
1659
1722
|
voice=effective_voice,
|
|
1660
1723
|
generation=generation,
|
|
1724
|
+
model_quality=model_quality,
|
|
1725
|
+
model_source=model_source,
|
|
1726
|
+
model_variant=model_variant,
|
|
1661
1727
|
model_path=model_path,
|
|
1662
1728
|
voices_path=voices_path,
|
|
1663
1729
|
)
|
|
@@ -10,9 +10,10 @@ This module contains commands for working with phonemes and pre-tokenized conten
|
|
|
10
10
|
import re
|
|
11
11
|
import sys
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from typing import Any
|
|
13
|
+
from typing import Any, cast
|
|
14
14
|
|
|
15
15
|
import click
|
|
16
|
+
from pykokoro.onnx_backend import DEFAULT_MODEL_QUALITY, ModelQuality
|
|
16
17
|
from rich.progress import (
|
|
17
18
|
BarColumn,
|
|
18
19
|
Progress,
|
|
@@ -37,6 +38,7 @@ from ..utils import (
|
|
|
37
38
|
format_filename_template,
|
|
38
39
|
load_config,
|
|
39
40
|
)
|
|
41
|
+
from .commands_utility import _resolve_model_source_and_variant
|
|
40
42
|
from .helpers import console, parse_voice_parameter
|
|
41
43
|
|
|
42
44
|
|
|
@@ -500,6 +502,10 @@ def phonemes_convert(
|
|
|
500
502
|
config = load_config()
|
|
501
503
|
model_path = ctx.obj.get("model_path") if ctx.obj else None
|
|
502
504
|
voices_path = ctx.obj.get("voices_path") if ctx.obj else None
|
|
505
|
+
model_source, model_variant = _resolve_model_source_and_variant(config)
|
|
506
|
+
model_quality = cast(
|
|
507
|
+
ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
|
|
508
|
+
)
|
|
503
509
|
|
|
504
510
|
# Get book info and metadata
|
|
505
511
|
book_info = book.get_info()
|
|
@@ -599,6 +605,9 @@ def phonemes_convert(
|
|
|
599
605
|
speed=speed,
|
|
600
606
|
output_format=fmt,
|
|
601
607
|
use_gpu=gpu,
|
|
608
|
+
model_quality=model_quality,
|
|
609
|
+
model_source=model_source,
|
|
610
|
+
model_variant=model_variant,
|
|
602
611
|
silence_between_chapters=silence,
|
|
603
612
|
pause_clause=(
|
|
604
613
|
pause_clause
|
|
@@ -834,6 +843,12 @@ def phonemes_preview(
|
|
|
834
843
|
# Auto-detect if voice is a blend
|
|
835
844
|
parsed_voice, parsed_voice_blend = parse_voice_parameter(voice)
|
|
836
845
|
|
|
846
|
+
config = load_config()
|
|
847
|
+
model_source, model_variant = _resolve_model_source_and_variant(config)
|
|
848
|
+
model_quality = cast(
|
|
849
|
+
ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
|
|
850
|
+
)
|
|
851
|
+
|
|
837
852
|
# Initialize converter
|
|
838
853
|
options = ConversionOptions(
|
|
839
854
|
phoneme_dictionary_path=str(phoneme_dict) if phoneme_dict else None,
|
|
@@ -841,6 +856,9 @@ def phonemes_preview(
|
|
|
841
856
|
voice_blend=parsed_voice_blend,
|
|
842
857
|
language=language,
|
|
843
858
|
output_format="wav", # Explicitly set WAV format
|
|
859
|
+
model_quality=model_quality,
|
|
860
|
+
model_source=model_source,
|
|
861
|
+
model_variant=model_variant,
|
|
844
862
|
)
|
|
845
863
|
converter = TTSConverter(options)
|
|
846
864
|
|
|
@@ -555,6 +555,14 @@ def _resolve_model_source_and_variant(cfg: dict) -> tuple[ModelSource, ModelVari
|
|
|
555
555
|
return cast(ModelSource, source), cast(ModelVariant, variant)
|
|
556
556
|
|
|
557
557
|
|
|
558
|
+
def _resolve_voice_names(
|
|
559
|
+
model_source: ModelSource = "huggingface",
|
|
560
|
+
model_variant: ModelVariant = "v1.0",
|
|
561
|
+
) -> list[str]:
|
|
562
|
+
"""Return the list of voice names for the given model variant."""
|
|
563
|
+
return VOICE_NAMES_BY_VARIANT.get(model_variant, VOICE_NAMES)
|
|
564
|
+
|
|
565
|
+
|
|
558
566
|
def _get_cache_voices_path(
|
|
559
567
|
model_source: ModelSource,
|
|
560
568
|
model_variant: ModelVariant,
|
|
@@ -708,7 +716,7 @@ def download(ctx: click.Context, force: bool, quality: str | None) -> None:
|
|
|
708
716
|
|
|
709
717
|
# ---- voices
|
|
710
718
|
if model_source == "huggingface":
|
|
711
|
-
voice_names =
|
|
719
|
+
voice_names = _resolve_voice_names(model_source, model_variant)
|
|
712
720
|
total_voices = len(voice_names)
|
|
713
721
|
voices_task = progress.add_task(
|
|
714
722
|
f"Downloading voices (0/{total_voices})...", total=total_voices
|
|
@@ -1269,6 +1277,12 @@ def list_names( # noqa: C901
|
|
|
1269
1277
|
)
|
|
1270
1278
|
console.print("[dim]Type 'q' to quit, 's' to skip, 'r' to replay.[/dim]\n")
|
|
1271
1279
|
|
|
1280
|
+
cfg = load_config()
|
|
1281
|
+
model_source, model_variant = _resolve_model_source_and_variant(cfg)
|
|
1282
|
+
model_quality = cast(
|
|
1283
|
+
ModelQuality, cfg.get("model_quality", DEFAULT_MODEL_QUALITY)
|
|
1284
|
+
)
|
|
1285
|
+
|
|
1272
1286
|
# Initialize converter with phoneme dictionary
|
|
1273
1287
|
try:
|
|
1274
1288
|
# Auto-detect if voice is a blend
|
|
@@ -1279,6 +1293,9 @@ def list_names( # noqa: C901
|
|
|
1279
1293
|
voice=parsed_voice or "af_sky",
|
|
1280
1294
|
voice_blend=parsed_voice_blend,
|
|
1281
1295
|
language=language,
|
|
1296
|
+
model_quality=model_quality,
|
|
1297
|
+
model_source=model_source,
|
|
1298
|
+
model_variant=model_variant,
|
|
1282
1299
|
)
|
|
1283
1300
|
converter = TTSConverter(options)
|
|
1284
1301
|
|
|
@@ -50,6 +50,7 @@ DEFAULT_SAMPLE_TEXT = (
|
|
|
50
50
|
DEMO_TEXT = {
|
|
51
51
|
"a": "Hello! This audio was generated by {voice}. How do you like it?",
|
|
52
52
|
"b": "Hello! This audio was generated by {voice}. How do you like it?",
|
|
53
|
+
"d": "Hallo! Dieses Audio wurde von {voice} erzeugt. Wie gefallt es Ihnen?",
|
|
53
54
|
"e": "Hola! Este audio fue generado por {voice}. Que te parece?",
|
|
54
55
|
"f": "Bonjour! Cet audio a ete genere par {voice}. Comment le trouvez-vous?",
|
|
55
56
|
"h": "Namaste! Yah audio {voice} dwara banaya gaya hai. Aapko kaisa laga?",
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
# from pykokoro.onnx_backend import VOICE_NAMES_V1_0
|
|
4
4
|
# from pykokoro.onnx_backend import VOICE_NAMES_V1_1_ZH, VOICE_NAMES_V1_1_DE
|
|
5
5
|
|
|
6
|
-
from pykokoro.onnx_backend import VOICE_NAMES_V1_0
|
|
6
|
+
from pykokoro.onnx_backend import DEFAULT_MODEL_SOURCE, VOICE_NAMES_V1_0
|
|
7
7
|
|
|
8
8
|
# Re-export from pykokoro for convenience
|
|
9
|
-
VOICES =
|
|
9
|
+
VOICES = VOICE_NAMES_V1_0
|
|
10
10
|
|
|
11
11
|
# Audio constants from pykokoro
|
|
12
12
|
try:
|
|
@@ -24,6 +24,7 @@ PROGRAM_DESCRIPTION = "Generate audiobooks from EPUB files using Kokoro ONNX TTS
|
|
|
24
24
|
LANGUAGE_DESCRIPTIONS = {
|
|
25
25
|
"a": "American English",
|
|
26
26
|
"b": "British English",
|
|
27
|
+
"d": "German",
|
|
27
28
|
"e": "Spanish",
|
|
28
29
|
"f": "French",
|
|
29
30
|
"h": "Hindi",
|
|
@@ -35,6 +36,8 @@ LANGUAGE_DESCRIPTIONS = {
|
|
|
35
36
|
|
|
36
37
|
# ISO language code to ttsforge language code mapping
|
|
37
38
|
ISO_TO_LANG_CODE = {
|
|
39
|
+
"de": "d",
|
|
40
|
+
"de-de": "d",
|
|
38
41
|
"en": "a", # Default to American English
|
|
39
42
|
"en-us": "a",
|
|
40
43
|
"en-gb": "b",
|
|
@@ -62,6 +65,8 @@ VOICE_PREFIX_TO_LANG = {
|
|
|
62
65
|
"am": "a", # American Male
|
|
63
66
|
"bf": "b", # British Female
|
|
64
67
|
"bm": "b", # British Male
|
|
68
|
+
"df": "d", # German Female
|
|
69
|
+
"dm": "d", # German Male
|
|
65
70
|
"ef": "e", # Spanish Female
|
|
66
71
|
"em": "e", # Spanish Male
|
|
67
72
|
"ff": "f", # French Female
|
|
@@ -82,6 +87,7 @@ VOICE_PREFIX_TO_LANG = {
|
|
|
82
87
|
DEFAULT_VOICE_FOR_LANG = {
|
|
83
88
|
"a": "af_heart",
|
|
84
89
|
"b": "bf_emma",
|
|
90
|
+
"d": "df_eva",
|
|
85
91
|
"e": "ef_dora",
|
|
86
92
|
"f": "ff_siwis",
|
|
87
93
|
"h": "hf_alpha",
|
|
@@ -115,6 +121,7 @@ DEFAULT_CONFIG = {
|
|
|
115
121
|
"use_gpu": False, # GPU requires onnxruntime-gpu
|
|
116
122
|
# Model quality: fp32, fp16, q8, q8f16, q4, q4f16, uint8, uint8f16
|
|
117
123
|
"model_quality": "fp32",
|
|
124
|
+
"model_source": DEFAULT_MODEL_SOURCE,
|
|
118
125
|
"model_variant": "v1.0",
|
|
119
126
|
"silence_between_chapters": 2.0,
|
|
120
127
|
"save_chapters_separately": False,
|
|
@@ -128,6 +135,7 @@ DEFAULT_CONFIG = {
|
|
|
128
135
|
"pause_paragraph": 0.9,
|
|
129
136
|
"pause_variance": 0.05,
|
|
130
137
|
"pause_mode": "auto", # "tts", "manual", or "auto
|
|
138
|
+
"enable_short_sentence": None,
|
|
131
139
|
# Language override for phonemization (e.g., 'de', 'fr', 'en-us')
|
|
132
140
|
# If None, language is determined from voice prefix
|
|
133
141
|
"phonemization_lang": None,
|
|
@@ -154,6 +162,7 @@ AUDIO_CHANNELS = 1
|
|
|
154
162
|
SAMPLE_TEXTS = {
|
|
155
163
|
"a": "This is a sample of the selected voice.",
|
|
156
164
|
"b": "This is a sample of the selected voice.",
|
|
165
|
+
"d": "Dies ist ein Beispiel für die ausgewählte Stimme.",
|
|
157
166
|
"e": "Este es una muestra de la voz seleccionada.",
|
|
158
167
|
"f": "Ceci est un exemple de la voix sélectionnée.",
|
|
159
168
|
"h": "यह चयनित आवाज़ का एक नमूना है।", # noqa: E501
|