PyPI - ttsforge - Versions diffs - 0.1.1__tar.gz → 0.1.2__tar.gz - Mend

ttsforge 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

{ttsforge-0.1.1 → ttsforge-0.1.2}/.github/workflows/codecov.yml RENAMED Viewed

@@ -12,7 +12,7 @@ jobs:
     - name: Install espeak-ng
       run: |
         sudo apt-get update
-        sudo apt-get install -y espeak-ng
+        sudo apt-get install -y espeak-ng ffmpeg
     - name: 'generate report'
       run: |
         pip install coverage click pytest pytest-cov

{ttsforge-0.1.1 → ttsforge-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ttsforge
-Version: 0.1.1
+Version: 0.1.2
 Summary: Generate audiobooks from EPUB files using Kokoro ONNX TTS.
 Author-email: Holger Nahrstaedt <nahrstaedt@gmail.com>
 License: MIT License
@@ -617,6 +617,7 @@ ttsforge convert book.epub --gpu
 | `pause_paragraph`           | `0.9`          | Paragraph pause (seconds)            |
 | `pause_variance`            | `0.05`         | Pause variance (seconds)             |
 | `pause_mode`                | `auto`         | Pause mode (`tts`, `manual`, `auto`) |
+| `enable_short_sentence`     | `None`         | Handle short sentences               |
 | `announce_chapters`         | `true`         | Speak chapter titles                 |
 | `chapter_pause_after_title` | `2.0`          | Pause after chapter title            |
 | `phonemization_lang`        | `None`         | Override phonemization language      |

{ttsforge-0.1.1 → ttsforge-0.1.2}/README.md RENAMED Viewed

@@ -554,6 +554,7 @@ ttsforge convert book.epub --gpu
 | `pause_paragraph`           | `0.9`          | Paragraph pause (seconds)            |
 | `pause_variance`            | `0.05`         | Pause variance (seconds)             |
 | `pause_mode`                | `auto`         | Pause mode (`tts`, `manual`, `auto`) |
+| `enable_short_sentence`     | `None`         | Handle short sentences               |
 | `announce_chapters`         | `true`         | Speak chapter titles                 |
 | `chapter_pause_after_title` | `2.0`          | Pause after chapter title            |
 | `phonemization_lang`        | `None`         | Override phonemization language      |

{ttsforge-0.1.1 → ttsforge-0.1.2}/docs/cli.rst RENAMED Viewed

@@ -85,6 +85,9 @@ Options
 ``--pause-mode MODE``
    Pause mode: ``tts``, ``manual``, or ``auto``. Default: ``auto``.
+``--enable-short-sentence``
+   Enable special handling for short sentences (fewer than 5 words).
 ``--announce-chapters / --no-announce-chapters``
    Read chapter titles aloud before chapter content. Default: enabled.

{ttsforge-0.1.1 → ttsforge-0.1.2}/docs/configuration.rst RENAMED Viewed

@@ -452,6 +452,7 @@ Here's an example ``config.json`` with custom settings:
      "pause_paragraph": 0.9,
      "pause_variance": 0.05,
      "pause_mode": "auto",
+     "enable_short_sentence": null,
      "announce_chapters": true,
      "chapter_pause_after_title": 2.0,
       "save_chapters_separately": false,

{ttsforge-0.1.1 → ttsforge-0.1.2}/tests/test_constants.py RENAMED Viewed

@@ -20,7 +20,7 @@ class TestLanguageDescriptions:
     def test_all_language_codes_have_descriptions(self):
         """All language codes should have descriptions."""
-        expected_codes = {"a", "b", "e", "f", "h", "i", "j", "p", "z"}
+        expected_codes = {"a", "b", "d", "e", "f", "h", "i", "j", "p", "z"}
         assert set(LANGUAGE_DESCRIPTIONS.keys()) == expected_codes
     def test_english_variants(self):
@@ -132,11 +132,6 @@ class TestDefaultVoiceForLang:
                 lang in DEFAULT_VOICE_FOR_LANG
             ), f"Language {lang} needs default voice"
-    def test_default_voices_exist_in_voices_list(self):
-        """All default voices should exist in VOICES list."""
-        for lang, voice in DEFAULT_VOICE_FOR_LANG.items():
-            assert voice in VOICES, f"Default voice {voice} for {lang} not in VOICES"
     def test_default_voices_match_language(self):
         """Default voices should match their language."""
         for lang, voice in DEFAULT_VOICE_FOR_LANG.items():

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/__init__.py RENAMED Viewed

@@ -18,7 +18,7 @@ from pykokoro.tokenizer import (
     Tokenizer,
 )
 from pykokoro.constants import SUPPORTED_LANGUAGES
+from pykokoro.onnx_backend import VOICE_NAMES_BY_VARIANT
 from .constants import (
     DEFAULT_CONFIG,
     LANGUAGE_DESCRIPTIONS,
@@ -27,23 +27,7 @@ from .constants import (
 )
 # Import from pykokoro
-try:
-    from pykokoro.constants import SAMPLE_RATE
-    from pykokoro.onnx_backend import LANG_CODE_TO_ONNX
-except ImportError:
-    # Fallback values if pykokoro not installed
-    SAMPLE_RATE = 24000
-    LANG_CODE_TO_ONNX = {
-        "a": "en-us",
-        "b": "en-gb",
-        "e": "es",
-        "f": "fr-fr",
-        "h": "hi",
-        "i": "it",
-        "j": "ja",
-        "p": "pt",
-        "z": "zh",
-    }
+from pykokoro.constants import SAMPLE_RATE
 from .conversion import (
     Chapter,
@@ -73,6 +57,7 @@ __all__ = [
     "LANGUAGE_DESCRIPTIONS",
     "SUPPORTED_OUTPUT_FORMATS",
     "VOICES",
+    "VOICE_NAMES_BY_VARIANT",
     # Conversion
     "Chapter",
     "ConversionOptions",

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
-__commit_id__ = commit_id = 'g08367e850'
+__commit_id__ = commit_id = 'gb31ed0898'

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/commands_conversion.py RENAMED Viewed

@@ -17,6 +17,7 @@ from typing import Literal, TypedDict, cast
 import click
 import numpy as np
+from pykokoro.onnx_backend import DEFAULT_MODEL_QUALITY, ModelQuality
 from rich.panel import Panel
 from rich.progress import (
     BarColumn,
@@ -37,7 +38,6 @@ from ..constants import (
     LANGUAGE_DESCRIPTIONS,
     SUPPORTED_OUTPUT_FORMATS,
     VOICE_PREFIX_TO_LANG,
-    VOICES,
 )
 from ..conversion import (
     Chapter,
@@ -54,6 +54,7 @@ from ..utils import (
     load_config,
     resolve_conversion_defaults,
 )
+from .commands_utility import _resolve_model_source_and_variant, _resolve_voice_names
 from .helpers import DEFAULT_SAMPLE_TEXT, console, parse_voice_parameter
@@ -64,6 +65,14 @@ class ContentItem(TypedDict):
     page_number: NotRequired[int]
+def get_voices() -> list[str]:
+    """Get the list of available voices."""
+    cfg = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(cfg)
+    return _resolve_voice_names(model_source, model_variant)
 @click.command()
 @click.argument("epub_file", type=click.Path(exists=True, path_type=Path))
 @click.option(
@@ -82,7 +91,7 @@ class ContentItem(TypedDict):
 @click.option(
     "-v",
     "--voice",
-    type=click.Choice(VOICES),
+    type=click.Choice(get_voices()),
     help="Voice to use for TTS.",
 )
 @click.option(
@@ -150,6 +159,12 @@ class ContentItem(TypedDict):
     default=None,
     help="Pause mode: 'tts', 'manual', or 'auto' (default: auto).",
 )
+@click.option(
+    "--enable-short-sentence/--disable-short-sentence",
+    "enable_short_sentence",
+    default=None,
+    help="Enable/disable special handling for short sentences.",
+)
 @click.option(
     "--announce-chapters/--no-announce-chapters",
     "announce_chapters",
@@ -296,6 +311,7 @@ def convert(  # noqa: C901
     pause_paragraph: float | None,
     pause_variance: float | None,
     pause_mode: str | None,
+    enable_short_sentence: bool | None,
     announce_chapters: bool | None,
     chapter_pause: float | None,
     title: str | None,
@@ -325,6 +341,10 @@ def convert(  # noqa: C901
     config = load_config()
     model_path = ctx.obj.get("model_path") if ctx.obj else None
     voices_path = ctx.obj.get("voices_path") if ctx.obj else None
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
     # Get format first (needed for output path construction)
     fmt = output_format or config.get("default_format", "m4b")
@@ -467,6 +487,9 @@ def convert(  # noqa: C901
         language=language or "a",
         speed=speed or config.get("default_speed", 1.0),
         use_gpu=use_gpu if use_gpu is not None else config.get("use_gpu", False),
+        model_source=model_source,
+        model_variant=model_variant,
+        model_quality=model_quality,
         num_chapters=len(selected_indices) if selected_indices else len(epub_chapters),
         title=effective_title,
         author=effective_author,
@@ -510,6 +533,9 @@ def convert(  # noqa: C901
         output_format=output_format or config.get("default_format", "m4b"),
         output_dir=output.parent,
         use_gpu=use_gpu if use_gpu is not None else config.get("use_gpu", False),
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         silence_between_chapters=silence or config.get("silence_between_chapters", 2.0),
         lang=lang or config.get("phonemization_lang"),
         use_mixed_language=(
@@ -556,6 +582,11 @@ def convert(  # noqa: C901
         pause_mode=(
             pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
         ),
+        enable_short_sentence=(
+            enable_short_sentence
+            if enable_short_sentence is not None
+            else config.get("enable_short_sentence", None)
+        ),
         announce_chapters=(
             announce_chapters
             if announce_chapters is not None
@@ -947,6 +978,10 @@ def sample(
     # Load config for defaults
     user_config = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(user_config)
+    model_quality = cast(
+        ModelQuality, user_config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
     resolved_defaults = resolve_conversion_defaults(
         user_config,
         {
@@ -980,6 +1015,9 @@ def sample(
         use_gpu=resolved_defaults["use_gpu"],
         split_mode=resolved_defaults["split_mode"],
         lang=resolved_defaults["lang"],
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         use_mixed_language=(
             use_mixed_language or user_config.get("use_mixed_language", False)
         ),
@@ -1117,6 +1155,9 @@ def _show_conversion_summary(
     language: str,
     speed: float,
     use_gpu: bool,
+    model_source: str,
+    model_variant: str,
+    model_quality: str | None,
     num_chapters: int,
     title: str,
     author: str,
@@ -1139,6 +1180,9 @@ def _show_conversion_summary(
     table.add_row("Chapters", str(num_chapters))
     table.add_row("Voice", voice)
     table.add_row("Language", LANGUAGE_DESCRIPTIONS.get(language, language))
+    table.add_row("Model Source", model_source)
+    table.add_row("Model Variant", model_variant)
+    table.add_row("Model Quality", str(model_quality))
     if lang:
         table.add_row("Phonemization Lang", f"{lang} (override)")
     if use_mixed_language:
@@ -1167,7 +1211,7 @@ def _show_conversion_summary(
 @click.option(
     "-v",
     "--voice",
-    type=click.Choice(VOICES),
+    type=click.Choice(get_voices()),
     help="TTS voice to use.",
 )
 @click.option(
@@ -1271,6 +1315,11 @@ def _show_conversion_summary(
     default=None,
     help="Trim leading/trailing silence from audio.",
 )
+@click.option(
+    "--enable-short-sentence/--disable-short-sentence",
+    default=None,
+    help="Enable special handling for short sentences.",
+)
 @click.pass_context
 def read(  # noqa: C901
     ctx: click.Context,
@@ -1293,6 +1342,7 @@ def read(  # noqa: C901
     pause_paragraph: float | None,
     pause_variance: float | None,
     pause_mode: str | None,
+    enable_short_sentence: bool | None,
 ) -> None:
     """Read an EPUB or text file aloud with streaming playback.
@@ -1340,6 +1390,10 @@ def read(  # noqa: C901
     # Load config for defaults
     config = load_config()
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
     resolved_defaults = resolve_conversion_defaults(
         config,
         {
@@ -1389,6 +1443,11 @@ def read(  # noqa: C901
     effective_pause_mode = (
         pause_mode if pause_mode is not None else config.get("pause_mode", "auto")
     )
+    effective_enable_short_sentence = (
+        enable_short_sentence
+        if enable_short_sentence is not None
+        else config.get("enable_short_sentence", None)
+    )
     # Get language code for TTS
     espeak_lang = LANG_CODE_TO_ONNX.get(effective_language, "en-us")
@@ -1645,11 +1704,15 @@ def read(  # noqa: C901
             model_path=model_path,
             voices_path=voices_path,
             use_gpu=effective_use_gpu,
+            model_quality=model_quality,
+            model_source=model_source,
+            model_variant=model_variant,
         )
         generation = GenerationConfig(
             speed=effective_speed,
             lang=espeak_lang,
             pause_mode=cast(Literal["tts", "manual", "auto"], effective_pause_mode),
+            enable_short_sentence=effective_enable_short_sentence,
             pause_clause=effective_pause_clause,
             pause_sentence=effective_pause_sentence,
             pause_paragraph=effective_pause_paragraph,
@@ -1658,6 +1721,9 @@ def read(  # noqa: C901
         pipeline_config = PipelineConfig(
             voice=effective_voice,
             generation=generation,
+            model_quality=model_quality,
+            model_source=model_source,
+            model_variant=model_variant,
             model_path=model_path,
             voices_path=voices_path,
         )

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/commands_phonemes.py RENAMED Viewed

@@ -10,9 +10,10 @@ This module contains commands for working with phonemes and pre-tokenized conten
 import re
 import sys
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 import click
+from pykokoro.onnx_backend import DEFAULT_MODEL_QUALITY, ModelQuality
 from rich.progress import (
     BarColumn,
     Progress,
@@ -37,6 +38,7 @@ from ..utils import (
     format_filename_template,
     load_config,
 )
+from .commands_utility import _resolve_model_source_and_variant
 from .helpers import console, parse_voice_parameter
@@ -500,6 +502,10 @@ def phonemes_convert(
     config = load_config()
     model_path = ctx.obj.get("model_path") if ctx.obj else None
     voices_path = ctx.obj.get("voices_path") if ctx.obj else None
+    model_source, model_variant = _resolve_model_source_and_variant(config)
+    model_quality = cast(
+        ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+    )
     # Get book info and metadata
     book_info = book.get_info()
@@ -599,6 +605,9 @@ def phonemes_convert(
         speed=speed,
         output_format=fmt,
         use_gpu=gpu,
+        model_quality=model_quality,
+        model_source=model_source,
+        model_variant=model_variant,
         silence_between_chapters=silence,
         pause_clause=(
             pause_clause
@@ -834,6 +843,12 @@ def phonemes_preview(
             # Auto-detect if voice is a blend
             parsed_voice, parsed_voice_blend = parse_voice_parameter(voice)
+            config = load_config()
+            model_source, model_variant = _resolve_model_source_and_variant(config)
+            model_quality = cast(
+                ModelQuality, config.get("model_quality", DEFAULT_MODEL_QUALITY)
+            )
             # Initialize converter
             options = ConversionOptions(
                 phoneme_dictionary_path=str(phoneme_dict) if phoneme_dict else None,
@@ -841,6 +856,9 @@ def phonemes_preview(
                 voice_blend=parsed_voice_blend,
                 language=language,
                 output_format="wav",  # Explicitly set WAV format
+                model_quality=model_quality,
+                model_source=model_source,
+                model_variant=model_variant,
             )
             converter = TTSConverter(options)

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/commands_utility.py RENAMED Viewed

@@ -555,6 +555,14 @@ def _resolve_model_source_and_variant(cfg: dict) -> tuple[ModelSource, ModelVari
     return cast(ModelSource, source), cast(ModelVariant, variant)
+def _resolve_voice_names(
+    model_source: ModelSource = "huggingface",
+    model_variant: ModelVariant = "v1.0",
+) -> list[str]:
+    """Return the list of voice names for the given model variant."""
+    return VOICE_NAMES_BY_VARIANT.get(model_variant, VOICE_NAMES)
 def _get_cache_voices_path(
     model_source: ModelSource,
     model_variant: ModelVariant,
@@ -708,7 +716,7 @@ def download(ctx: click.Context, force: bool, quality: str | None) -> None:
         # ---- voices
         if model_source == "huggingface":
-            voice_names = VOICE_NAMES_BY_VARIANT.get(model_variant, VOICE_NAMES)
+            voice_names = _resolve_voice_names(model_source, model_variant)
             total_voices = len(voice_names)
             voices_task = progress.add_task(
                 f"Downloading voices (0/{total_voices})...", total=total_voices
@@ -1269,6 +1277,12 @@ def list_names(  # noqa: C901
         )
         console.print("[dim]Type 'q' to quit, 's' to skip, 'r' to replay.[/dim]\n")
+        cfg = load_config()
+        model_source, model_variant = _resolve_model_source_and_variant(cfg)
+        model_quality = cast(
+            ModelQuality, cfg.get("model_quality", DEFAULT_MODEL_QUALITY)
+        )
         # Initialize converter with phoneme dictionary
         try:
             # Auto-detect if voice is a blend
@@ -1279,6 +1293,9 @@ def list_names(  # noqa: C901
                 voice=parsed_voice or "af_sky",
                 voice_blend=parsed_voice_blend,
                 language=language,
+                model_quality=model_quality,
+                model_source=model_source,
+                model_variant=model_variant,
             )
             converter = TTSConverter(options)

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/cli/helpers.py RENAMED Viewed

@@ -50,6 +50,7 @@ DEFAULT_SAMPLE_TEXT = (
 DEMO_TEXT = {
     "a": "Hello! This audio was generated by {voice}. How do you like it?",
     "b": "Hello! This audio was generated by {voice}. How do you like it?",
+    "d": "Hallo! Dieses Audio wurde von {voice} erzeugt. Wie gefallt es Ihnen?",
     "e": "Hola! Este audio fue generado por {voice}. Que te parece?",
     "f": "Bonjour! Cet audio a ete genere par {voice}. Comment le trouvez-vous?",
     "h": "Namaste! Yah audio {voice} dwara banaya gaya hai. Aapko kaisa laga?",

{ttsforge-0.1.1 → ttsforge-0.1.2}/ttsforge/constants.py RENAMED Viewed

@@ -3,10 +3,10 @@
 # from pykokoro.onnx_backend import VOICE_NAMES_V1_0
 # from pykokoro.onnx_backend import VOICE_NAMES_V1_1_ZH, VOICE_NAMES_V1_1_DE
-from pykokoro.onnx_backend import VOICE_NAMES_V1_0 as VOICE_NAMES
+from pykokoro.onnx_backend import DEFAULT_MODEL_SOURCE, VOICE_NAMES_V1_0
 # Re-export from pykokoro for convenience
-VOICES = VOICE_NAMES
+VOICES = VOICE_NAMES_V1_0
 # Audio constants from pykokoro
 try:
@@ -24,6 +24,7 @@ PROGRAM_DESCRIPTION = "Generate audiobooks from EPUB files using Kokoro ONNX TTS
 LANGUAGE_DESCRIPTIONS = {
     "a": "American English",
     "b": "British English",
+    "d": "German",
     "e": "Spanish",
     "f": "French",
     "h": "Hindi",
@@ -35,6 +36,8 @@ LANGUAGE_DESCRIPTIONS = {
 # ISO language code to ttsforge language code mapping
 ISO_TO_LANG_CODE = {
+    "de": "d",
+    "de-de": "d",
     "en": "a",  # Default to American English
     "en-us": "a",
     "en-gb": "b",
@@ -62,6 +65,8 @@ VOICE_PREFIX_TO_LANG = {
     "am": "a",  # American Male
     "bf": "b",  # British Female
     "bm": "b",  # British Male
+    "df": "d",  # German Female
+    "dm": "d",  # German Male
     "ef": "e",  # Spanish Female
     "em": "e",  # Spanish Male
     "ff": "f",  # French Female
@@ -82,6 +87,7 @@ VOICE_PREFIX_TO_LANG = {
 DEFAULT_VOICE_FOR_LANG = {
     "a": "af_heart",
     "b": "bf_emma",
+    "d": "df_eva",
     "e": "ef_dora",
     "f": "ff_siwis",
     "h": "hf_alpha",
@@ -115,6 +121,7 @@ DEFAULT_CONFIG = {
     "use_gpu": False,  # GPU requires onnxruntime-gpu
     # Model quality: fp32, fp16, q8, q8f16, q4, q4f16, uint8, uint8f16
     "model_quality": "fp32",
+    "model_source": DEFAULT_MODEL_SOURCE,
     "model_variant": "v1.0",
     "silence_between_chapters": 2.0,
     "save_chapters_separately": False,
@@ -128,6 +135,7 @@ DEFAULT_CONFIG = {
     "pause_paragraph": 0.9,
     "pause_variance": 0.05,
     "pause_mode": "auto",  # "tts", "manual", or "auto"
+    "enable_short_sentence": None,
     # Language override for phonemization (e.g., 'de', 'fr', 'en-us')
     # If None, language is determined from voice prefix
     "phonemization_lang": None,
@@ -154,6 +162,7 @@ AUDIO_CHANNELS = 1
 SAMPLE_TEXTS = {
     "a": "This is a sample of the selected voice.",
     "b": "This is a sample of the selected voice.",
+    "d": "Dies ist ein Beispiel für die ausgewählte Stimme.",
     "e": "Este es una muestra de la voz seleccionada.",
     "f": "Ceci est un exemple de la voix sélectionnée.",
     "h": "यह चयनित आवाज़ का एक नमूना है।",  # noqa: E501

ttsforge 0.1.1__tar.gz → 0.1.2__tar.gz

ttsforge 0.1.1tar.gz → 0.1.2tar.gz