PyPI - sinapsis-speech - Versions diffs - 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

sinapsis-speech 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py ADDED Viewed

@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+from typing import Literal
+from pydantic.dataclasses import dataclass
+kokoro_voices = Literal[
+    "af_heart",
+    "af_alloy",
+    "af_aoede",
+    "af_bella",
+    "af_jessica",
+    "af_kore",
+    "af_nicole",
+    "af_nova",
+    "af_river",
+    "af_sarah",
+    "af_sky",
+    "am_adam",
+    "am_echo",
+    "am_eric",
+    "am_fenrir",
+    "am_liam",
+    "am_michael",
+    "am_onyx",
+    "am_puck",
+    "am_santa",
+    "bf_alice",
+    "bf_emma",
+    "bf_isabella",
+    "bf_lily",
+    "bm_daniel",
+    "bm_fable",
+    "bm_george",
+    "bm_lewis",
+    "jf_alpha",
+    "jf_gongitsune",
+    "jf_nezumi",
+    "jf_tebukuro",
+    "jm_kumo",
+    "zf_xiaobei",
+    "zf_xiaoni",
+    "zf_xiaoxiao",
+    "zf_xiaoyi",
+    "zm_yunjian",
+    "zm_yunxi",
+    "zm_yunxia",
+    "zm_yunyang",
+    "ef_dora",
+    "em_alex",
+    "em_santa",
+    "ff_siwis",
+    "hf_alpha",
+    "hf_beta",
+    "hm_omega",
+    "hm_psi",
+    "if_sara",
+    "im_nicola",
+    "pf_dora",
+    "pm_alex",
+    "pm_santa",
+]
+@dataclass(frozen=True)
+class KokoroKeys:
+    """
+    A class to hold constants for the keys used in the Text-to-Speech (TTS) model configuration.
+    These keys represent standard fields that are used to configure various parameters of the TTS model,
+    such as speaker attributes, emotions, and other audio-related settings. They are typically used in
+    templates and potentially a TTS web application to adjust and access specific TTS settings.
+    """
+    repo_id: Literal["hexgrad/Kokoro-82M"] = "hexgrad/Kokoro-82M"
+    default_voice: Literal["af_heart"] = "af_heart"

sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+import importlib
+from typing import Callable
+_root_lib_path = "sinapsis_kokoro.templates"
+_template_lookup = {
+    "KokoroTTS": f"{_root_lib_path}.kokoro_tts",
+}
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+__all__ = list(_template_lookup.keys())

sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py ADDED Viewed

@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+from typing import Generator
+from urllib.error import HTTPError
+import torch
+from kokoro import KPipeline
+from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
+from sinapsis_core.template_base.template import (
+    Template,
+    TemplateAttributes,
+    TemplateAttributeType,
+)
+from sinapsis_core.utils.logging_utils import make_loguru
+from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
+class KokoroTTS(Template):
+    """
+    Template for text-to-speech (TTS) synthesis using the Kokoro 82M v1.0 model.
+    This class handles the initialization of the TTS pipeline, speech generation,
+    and packaging the output audio in the desired format.
+    Usage example:
+    agent:
+      name: my_test_agent
+    templates:
+    - template_name: InputTemplate
+      class_name: InputTemplate
+      attributes: {}
+    - template_name: KokoroTTS
+      class_name: KokoroTTS
+      template_input: InputTemplate
+      attributes:
+        speed: 1
+        voice: af_heart
+    """
+    class AttributesBaseModel(TemplateAttributes):
+        """
+        Configuration attributes for the Kokoro TTS model.
+        Args:
+            speed (int | float): The speed at which the speech will be generated. Default is 1 (normal speed).
+            split_pattern (str): The regular expression pattern used to split the input text into smaller chunks.
+                Default is r"\n+" (split on newlines).
+            voice (kokoro_voices): The voice model to use for speech synthesis. Default is "af_heart".
+        Notes:
+            The list of languages and voices supported by Kokoro can be found at:
+            https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
+        """
+        speed: int | float = 1
+        split_pattern: str = r"\n+"
+        voice: kokoro_voices = KokoroKeys.default_voice
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        """Initializes the Kokoro TTS pipeline with the provided attributes."""
+        super().__init__(attributes)
+        self.pipeline = self.init_pipeline()
+        self.logger = make_loguru()
+    def init_pipeline(self) -> KPipeline:
+        """
+        Initializes the Kokoro TTS pipeline with the voice model and repository id.
+        Returns:
+            KPipeline: The initialized TTS pipeline for generating speech.
+        """
+        return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id)
+    def _create_audio_packet(
+        self,
+        audio_data: torch.tensor,
+        sample_rate: int,
+        container: DataContainer,
+    ) -> None:
+        """
+        Creates an audio packet from the generated audio data and adds it to the container.
+        Args:
+            audio_data (torch.tensor): The generated audio data (raw audio).
+            sample_rate (int): The sample rate of the generated audio (typically 24000 Hz).
+            container (DataContainer): The container to which the audio packet will be added.
+        """
+        audio_packet = AudioPacket(
+            content=audio_data,
+            source=self.instance_name,
+            sample_rate=sample_rate,
+        )
+        container.audios.append(audio_packet)
+    def _process_audio_chunks(self, generator: Generator, container: DataContainer) -> None:
+        """
+        Processes the audio chunks generated by the pipeline and creates audio packets.
+        Args:
+            generator: The generator that yields text, phonemes, and audio data.
+            container (DataContainer): The container holding the input data.
+        """
+        for i, (gs, ps, audio) in enumerate(generator):
+            self.logger.debug(f"Index: {i}")
+            self.logger.debug(f"Text: {gs}")
+            self.logger.debug(f"Phonemes: {ps}")
+            if audio is not None:
+                self._create_audio_packet(audio, 24000, container)
+            else:
+                self.logger.warning(f"Audio is None for index {i}")
+    def generate_speech(self, container: DataContainer) -> None:
+        """
+        Generates speech from the input text in the provided data container.
+        Args:
+            container (DataContainer): The container holding the input text data to be converted into speech.
+        """
+        input_text = "".join(t.content for t in container.texts)
+        generator = self.pipeline(
+            input_text,
+            voice=self.attributes.voice,
+            speed=self.attributes.speed,
+            split_pattern=self.attributes.split_pattern,
+        )
+        try:
+            self._process_audio_chunks(generator, container)
+        except HTTPError as e:
+            self.logger.error(f"Unable to generate speech: {e}")
+    def execute(self, container: DataContainer) -> DataContainer:
+        """
+        Processes the input data and generates the corresponding speech output.
+        Args:
+            container (DataContainer): The container holding the input text data.
+        Returns:
+            DataContainer: The updated container with the generated audio.
+        """
+        if not container.texts:
+            self.logger.debug("No query to enter")
+            return container
+        self.generate_speech(container)
+        return container

{sinapsis_speech-0.2.2.dist-info → sinapsis_speech-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sinapsis-speech
-Version: 0.2.2
+Version: 0.3.0
 Summary: Generate speech using various libraries.
 Author-email: SinapsisAI <dev@sinapsis-ai.com>
 Project-URL: Homepage, https://sinapsis.tech
@@ -15,6 +15,7 @@ Requires-Dist: sinapsis>=0.2.2
 Provides-Extra: all
 Requires-Dist: sinapsis-elevenlabs[all]; extra == "all"
 Requires-Dist: sinapsis-f5-tts[all]; extra == "all"
+Requires-Dist: sinapsis-kokoro[all]; extra == "all"
 Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
 Requires-Dist: sinapsis-zonos[all]; extra == "all"
 Provides-Extra: gradio-app
@@ -54,6 +55,7 @@ This repo includes packages for performing speech synthesis using different tool
 * <code>sinapsis-elevenlabs</code>
 * <code>sinapsis-f5-tts</code>
+* <code>sinapsis-kokoro</code>
 * <code>sinapsis-zonos</code>
 Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
@@ -123,7 +125,14 @@ This package provides a template for seamlessly integrating, configuring, and ru
 For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_f5_tts/README.md).
 </details>
+<details>
+<summary id="kokoro"><strong><span style="font-size: 1.4em;"> Sinapsis Kokoro</span></strong></summary>
+This package provides a single template for integrating, configuring, and running text-to-speech (TTS) synthesis using the [Kokoro 82M v1.0](https://huggingface.co/hexgrad/Kokoro-82M) model.
+KokoroTTS: Converts text to speech using the Kokoro TTS model. The template processes text packets from the input container, generates corresponding audio using Kokoro, and adds the resulting audio packets to the container.
+For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_kokoro/README.md).
+</details>
 <details>
 <summary id="zonos"><strong><span style="font-size: 1.4em;"> Sinapsis Zonos</span></strong></summary>
@@ -162,41 +171,56 @@ cd sinapsis-speech
 > [!NOTE]
 > Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
 <details>
 <summary id="docker"><strong><span style="font-size: 1.4em;">🐳 Docker</span></strong></summary>
 **IMPORTANT**: This Docker image depends on the `sinapsis-nvidia:base` image. For detailed instructions, please refer to the [Sinapsis README](https://github.com/Sinapsis-ai/sinapsis?tab=readme-ov-file#docker).
 1. **Build the sinapsis-speech image**:
 ```bash
 docker compose -f docker/compose.yaml build
 ```
 2. **Start the app container**:
-For ElevenLabs:
+- For ElevenLabs:
 ```bash
 docker compose -f docker/compose_apps.yaml up -d sinapsis-elevenlabs
 ```
-For F5-TTS:
+- For F5-TTS:
 ```bash
 docker compose -f docker/compose_apps.yaml up -d sinapsis-f5_tts
 ```
-For Zonos:
+- For Kokoro:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-kokoro
+```
+- For Zonos:
 ```bash
 docker compose -f docker/compose_apps.yaml up -d sinapsis-zonos
 ```
 3. **Check the logs**
-For ElevenLabs:
+- For ElevenLabs:
 ```bash
 docker logs -f sinapsis-elevenlabs
 ```
-For F5-TTS:
+- For F5-TTS:
 ```bash
 docker logs -f sinapsis-f5tts
 ```
-For Zonos:
+- For Kokoro:
+```bash
+docker logs -f sinapsis-kokoro
+```
+- For Zonos:
 ```bash
 docker logs -f sinapsis-zonos
 ```
@@ -227,18 +251,26 @@ uv sync --frozen
 uv pip install sinapsis-speech[all] --extra-index-url https://pypi.sinapsis.tech
 ```
 3. **Run the webapp**:
-For ElevenLabs:
+- For ElevenLabs:
 ```bash
-uv run webapps/elevenlabs/elevenlabs_tts_app.py
+uv run webapps/generic_tts_apps/elevenlabs_tts_app.py
 ```
-For F5-TTS:
+- For F5-TTS:
+```bash
+uv run webapps/packet_tts_apps/f5_tts_app.py
+```
+- For Kokoro:
 ```bash
-uv run webapps/f5-tts/f5_tts_app.py
+uv run webapps/packet_tts_apps/kokoro_tts_app.py
 ```
-For Zonos:
+- For Zonos:
 ```bash
-uv run webapps/zonos/zonos_tts_app.py
+uv run webapps/generic_tts_apps/zonos_tts_app.py
 ```
 4. **The terminal will display the URL to access the webapp (e.g.)**:
 ```bash

{sinapsis_speech-0.2.2.dist-info → sinapsis_speech-0.3.0.dist-info}/RECORD RENAMED Viewed

@@ -9,14 +9,17 @@ sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generatio
 sinapsis_f5_tts/src/sinapsis_f5_tts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_f5_tts/src/sinapsis_f5_tts/templates/__init__.py,sha256=28BOPAr9GG1jYcrXi45ZWO1n2FAZJOdDcmRkOXdEYmk,496
 sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py,sha256=7EBxw-tRthbPDz0zFopaLdBhv7DXwxyMGXam6F1MwGs,15802
-sinapsis_speech-0.2.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+sinapsis_kokoro/src/sinapsis_kokoro/helpers/kokoro_utils.py,sha256=2IMJuwURPKK7keIkgS-rpGD28REG5M1FwW0COGcm3nI,1573
+sinapsis_kokoro/src/sinapsis_kokoro/templates/__init__.py,sha256=aX25GCUNGzIBeY5kifomsB-nSzW-unfq0-aC2Rpnaws,485
+sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py,sha256=17fAmVD-uLaM6zZHdBXjLcKEJbe5s0uDV9IYtmjC57Q,5259
+sinapsis_speech-0.3.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
 sinapsis_zonos/src/sinapsis_zonos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py,sha256=m1GdOYfzP73JGmtxH30mNiqbNkzFsQl9o2QaT7QxSVU,2470
 sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py,sha256=8Tr2YgxjBfRqv_Hf6sw36X2pLzW7fdQWqa6QPBxNZK8,6419
 sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py,sha256=A-_F0K3hbEFqeWWAh4YftgU9CFX-WHrauSiCAww9yp8,482
 sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py,sha256=KsNuT8cFTTjTEqjfEWsIr4B-DjGhVacSw2SdPckuFvk,7507
-sinapsis_speech-0.2.2.dist-info/METADATA,sha256=dHZvwWrOxQAlvOYlqM96pazsQfT-Byw_EVDAU0innXc,8968
-sinapsis_speech-0.2.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
-sinapsis_speech-0.2.2.dist-info/top_level.txt,sha256=vQFjL84TMSRld2lKvEVMUNyY2b3AVluCT1Ijws7o7_c,51
-sinapsis_speech-0.2.2.dist-info/RECORD,,
+sinapsis_speech-0.3.0.dist-info/METADATA,sha256=9fQtDUnhPIesfZg-FF8Rk6074yGwG0WHZDgNjrsGa24,10032
+sinapsis_speech-0.3.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+sinapsis_speech-0.3.0.dist-info/top_level.txt,sha256=dd-bGAKXxelJCHcNxFZM4OTJ2mylgM2astOGPpj91yo,67
+sinapsis_speech-0.3.0.dist-info/RECORD,,

{sinapsis_speech-0.2.2.dist-info → sinapsis_speech-0.3.0.dist-info}/top_level.txt RENAMED Viewed

@@ -1,3 +1,4 @@
 sinapsis_elevenlabs
 sinapsis_f5_tts
+sinapsis_kokoro
 sinapsis_zonos

{sinapsis_speech-0.2.2.dist-info → sinapsis_speech-0.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{sinapsis_speech-0.2.2.dist-info → sinapsis_speech-0.3.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sinapsis-speech 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

sinapsis-speech 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl