PyPI - sinapsis-speech - Versions diffs - 0.4.5__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

sinapsis-speech 0.4.5py3-none-any.whl → 0.5.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

sinapsis_csm/__init__.py ADDED Viewed

File without changes

sinapsis_csm/src/sinapsis_csm/__init__.py ADDED Viewed

File without changes

sinapsis_csm/src/sinapsis_csm/helpers/generator.py ADDED Viewed

@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+from typing import Literal
+import torch
+from csm.generator import Generator
+from csm.models import Model
+class CSMGenerator:
+    """
+    Wrapper around the CSM model providing a simple interface
+    for text-to-speech generation
+    """
+    def __init__(self, device: Literal["cpu", "cuda"] = "cpu", sample_rate: int = 24000) -> None:
+        self.device: str = device
+        self.sample_rate: int = sample_rate
+        self.model: Model = Model.from_pretrained("sesame/csm-1b")
+        self.model.to(device=device)
+        self.model.sample_rate = sample_rate
+        self.generator = Generator(self.model)
+    def generate(
+        self, text: str, speaker: int = 0, context: list[str] | None = None, max_audio_length_ms: int = 10000
+    ) -> torch.Tensor:
+        if context is None:
+            context = []
+        return self.generator.generate(
+            text=text,
+            speaker=speaker,
+            context=context,
+            max_audio_length_ms=max_audio_length_ms,
+        )
+def load_csm_1b(device: Literal["cpu", "cuda"] = "cpu", sample_rate: int = 24000) -> CSMGenerator:
+    """
+    Loads and configures the CSM TTS model.
+    Returns:
+        CSMGenerator: Model wrapper with ready-to-use generate method.
+    """
+    return CSMGenerator(device=device, sample_rate=sample_rate)

sinapsis_csm/src/sinapsis_csm/templates/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+import importlib
+from typing import Callable
+from sinapsis_csm.templates.csm_tts import CSMTTS
+_root_lib_path = "sinapsis_csm.templates"  # dotted path of this templates package
+_template_lookup = {  # exported template name -> dotted path of the module defining it
+    "CSMTTS": f"{_root_lib_path}.csm_tts",
+}
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+    raise AttributeError(f"Template `{name}` not found in `{_root_lib_path}`.")
+__all__ = ["CSMTTS"]  # names exported by a star-import of this package

sinapsis_csm/src/sinapsis_csm/templates/csm_tts.py ADDED Viewed

@@ -0,0 +1,88 @@
+from typing import Literal
+import torch
+from sinapsis_core.data_containers.data_packet import AudioPacket, DataContainer
+from sinapsis_core.template_base import Template
+from sinapsis_core.template_base.base_models import TemplateAttributes, TemplateAttributeType
+from sinapsis_csm.helpers.generator import load_csm_1b
+class CSMTTS(Template):
+    """
+    Sinapsis template for converting text into speech using the CSM TTS model.
+    """
+    class AttributesBaseModel(TemplateAttributes):  # type: ignore
+        """
+        Defines configurable attributes for the CSMTTS template.
+        """
+        speaker_id: int = 0
+        max_audio_length_ms: int = 10000
+        device: Literal["cuda", "cpu"] = "cpu"
+        context: list[str] | None = None
+        sample_rate_hz: int = 24000
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        """
+        Initializes the template and loads the CSM model.
+        Args:
+            attributes (TemplateAttributeType): User-defined attributes from YAML configuration.
+        """
+        super().__init__(attributes)
+        self.model = load_csm_1b(
+            device=self.attributes.device,
+            sample_rate=self.attributes.sample_rate_hz
+        )
+    def generate_audio(self, text: str) -> torch.Tensor:
+        """
+        Converts input text to audio using the CSM model.
+        Args:
+            text (str): Input text string.
+        Returns:
+            torch.Tensor: Audio waveform tensor.
+        """
+        context = self.attributes.context if self.attributes.context else []
+        return self.model.generate(
+            text=text,
+            speaker=self.attributes.speaker_id,
+            context=context,
+            max_audio_length_ms=self.attributes.max_audio_length_ms,
+        )
+    def generate_audio_packet(self, audio: torch.Tensor, source_text: str) -> AudioPacket:
+        """
+        Wraps a raw audio tensor into a sinapsis compatible audioPacket
+        Args:
+            audio (torch.Tensor): Audio waveform.
+            source_text (str): Original input text used for generation.
+        Returns:
+            AudioPacket: Encapsulated audio data with metadata.
+        """
+        audio_np = audio.cpu().numpy()
+        return AudioPacket(
+            content=audio_np,
+            sample_rate=self.attributes.sample_rate_hz,
+            generic_data={"source_text": source_text, "model": "CSM"}
+        )
+    def execute(self, container: DataContainer) -> DataContainer:
+        """
+        Main method executed by Sinapsis. Converts all text packets in the input container to audio.
+        Args:
+            container (DataContainer): Input container with text packets.
+        Returns:
+            DataContainer: Output container with generated audio packets.
+        """
+        for packet in container.texts:
+            audio = self.generate_audio(packet.content)
+            audio_packet = self.generate_audio_packet(audio, packet.content)
+            audio_packet.source = self.instance_name
+            container.audios.append(audio_packet)
+        return container

{sinapsis_speech-0.4.5.dist-info → sinapsis_speech-0.5.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sinapsis-speech
-Version: 0.4.5
+Version: 0.5.0
 Summary: Generate speech using various libraries.
 Author-email: SinapsisAI <dev@sinapsis.tech>
 Project-URL: Homepage, https://sinapsis.tech
@@ -20,6 +20,7 @@ Requires-Dist: sinapsis-speech[gradio-app]; extra == "all"
 Requires-Dist: sinapsis-zonos[all]; extra == "all"
 Requires-Dist: sinapsis-parakeet-tdt[all]; extra == "all"
 Requires-Dist: sinapsis-orpheus-cpp[all]; extra == "all"
+Requires-Dist: sinapsis-csm[all]; extra == "all"
 Provides-Extra: gradio-app
 Requires-Dist: sinapsis[webapp]>=0.2.3; extra == "gradio-app"
 Dynamic: license-file
@@ -61,6 +62,7 @@ This repo includes packages for performing speech synthesis using different tool
 * <code>sinapsis-zonos</code>
 * <code>sinapsis-orpheus-cpp</code>
 * <code>sinapsis-parakeet</code>
+* <code>sinapsis-csm</code>
 Install using your preferred package manager. We strongly recommend using <code>uv</code>. To install <code>uv</code>, refer to the [official documentation](https://docs.astral.sh/uv/getting-started/installation/#installation-methods).
@@ -176,6 +178,17 @@ For specific instructions and further details, see the [README.md](https://githu
 </details>
+<details>
+<summary id="csm"><strong><span style="font-size: 1.4em;"> Sinapsis CSM</span></strong></summary>
+This package provides a template for seamlessly integrating, configuring, and running **text-to-speech (TTS)** functionalities powered by [SesameAILabs CSM](https://github.com/SesameAILabs/csm/tree/main?tab=readme-ov-file).
+- **CSMTTS**: Converts text into speech using the CSM model. This template processes text packets from the input container and adds the resulting audio packets to the container.
+For specific instructions and further details, see the [README.md](https://github.com/Sinapsis-AI/sinapsis-speech/blob/main/packages/sinapsis_csm/README.md).
+</details>
 <h2 id="webapp">🌐 Webapps</h2>
 The webapps included in this project showcase the modularity of the templates, in this case for speech generation tasks.
@@ -200,6 +213,9 @@ cd sinapsis-speech
 > [!IMPORTANT]
 > F5-TTS requires a reference audio file for voice cloning. Make sure you have a reference audio file in the artifacts directory.
+> [!IMPORTANT]
+> CSM requires an HF_TOKEN to run any inference. See the [official instructions](https://huggingface.co/docs/hub/security-tokens) and set it using <code>export HF_TOKEN="token-provided-by-hf"</code>
 > [!NOTE]
 > Agent configuration can be changed through the `AGENT_CONFIG_PATH` env var. You can check the available configurations in each package configs folder.
@@ -246,6 +262,11 @@ docker compose -f docker/compose_apps.yaml up -d sinapsis-orpheus-tts
 docker compose -f docker/compose_apps.yaml up -d sinapsis-parakeet
 ```
+- For CSM:
+```bash
+docker compose -f docker/compose_apps.yaml up -d sinapsis-csm
+```
 3. **Check the logs**
 - For ElevenLabs:
@@ -276,6 +297,11 @@ docker logs -f sinapsis-orpheus-tts
 docker logs -f sinapsis-parakeet
 ```
+- For CSM:
+```bash
+docker logs -f sinapsis-csm
+```
 4. **The logs will display the URL to access the webapp, e.g.:**
 ```bash
 Running on local URL:  http://127.0.0.1:7860
@@ -335,6 +361,12 @@ uv run webapps/packet_tts_apps/kokoro_tts_app.py
 ```bash
 uv run webapps/generic_tts_apps/zonos_tts_app.py
 ```
+- For CSM:
+```bash
+uv run webapps/generic_tts_apps/csm_tts_app.py
+```
 4. **The terminal will display the URL to access the webapp (e.g.)**:
 ```bash
 Running on local URL:  http://127.0.0.1:7860

{sinapsis_speech-0.4.5.dist-info → sinapsis_speech-0.5.0.dist-info}/RECORD RENAMED Viewed

@@ -1,3 +1,8 @@
+sinapsis_csm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sinapsis_csm/src/sinapsis_csm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sinapsis_csm/src/sinapsis_csm/helpers/generator.py,sha256=YQkZOHqc6bHiriasRJpPNElTdV2W3r5egiSZtRTCEAs,1336
+sinapsis_csm/src/sinapsis_csm/templates/__init__.py,sha256=ByEZu3rcqETfkVG0He91bTDzGWWUdY-Zn5b4i2MLHc4,485
+sinapsis_csm/src/sinapsis_csm/templates/csm_tts.py,sha256=s4zj_QzSR9FehlPvz_3oLA9WLyoI4LYHjhiHb4-HfW0,3101
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/env_var_keys.py,sha256=j8J64iplBNaff1WvmfJ03eJozE1f5SdqtqQeldV2vPY,998
@@ -30,7 +35,7 @@ sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py,sha256=47DEQ
 sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py,sha256=OKZbq4zIL6XWM7eG5WuQ3dWYkmYNWjuCnlseXmjR_j0,262
 sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py,sha256=3LppgbS6v70Rmx__yXXQgnoZ2ZBHcXkXeWZYQQf6Zwg,504
 sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py,sha256=Tw9S8Nqf74lXwUxBodaLK_JaQvh9ITt8cWFQJ2QNP6s,10210
-sinapsis_speech-0.4.5.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+sinapsis_speech-0.5.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
 sinapsis_zonos/src/sinapsis_zonos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_zonos/src/sinapsis_zonos/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py,sha256=Y7MKQNx1E2k7ebF6r_1l1nBeS5k8hO424yFTT9NI7Rg,244
@@ -38,7 +43,7 @@ sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_keys.py,sha256=m1GdOYfzP73JGmtxH
 sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py,sha256=bwu88wsJGzEqbssgb-wpS_7lFscJ74J8cgyca-hX_Qw,6422
 sinapsis_zonos/src/sinapsis_zonos/templates/__init__.py,sha256=A-_F0K3hbEFqeWWAh4YftgU9CFX-WHrauSiCAww9yp8,482
 sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py,sha256=h5EToXoJgAgjqvz9WLDfSjhCsV5zgBwZrX5cTJ4VnhM,7679
-sinapsis_speech-0.4.5.dist-info/METADATA,sha256=ZBGpQgEu2_I7DDsO_t2MO690zMA0OtncYXMUmGTA6-M,12783
-sinapsis_speech-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sinapsis_speech-0.4.5.dist-info/top_level.txt,sha256=KvdwXupt5wnqb_4XGRcuJaL9Glgdw-DBvRkNzhgl_Ds,110
-sinapsis_speech-0.4.5.dist-info/RECORD,,
+sinapsis_speech-0.5.0.dist-info/METADATA,sha256=4GliLgI5CoPUwqQnYPnSdxDGI30WAciSCRNSpjNO9FQ,13987
+sinapsis_speech-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sinapsis_speech-0.5.0.dist-info/top_level.txt,sha256=V3zOrj7E7CvmLsN7sNeISyc_yJPwKaw_V-msOpmvK30,123
+sinapsis_speech-0.5.0.dist-info/RECORD,,

{sinapsis_speech-0.4.5.dist-info → sinapsis_speech-0.5.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{sinapsis_speech-0.4.5.dist-info → sinapsis_speech-0.5.0.dist-info}/top_level.txt RENAMED Viewed

@@ -1,3 +1,4 @@
+sinapsis_csm
 sinapsis_elevenlabs
 sinapsis_f5_tts
 sinapsis_kokoro

{sinapsis_speech-0.4.5.dist-info → sinapsis_speech-0.5.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sinapsis-speech 0.4.5__py3-none-any.whl → 0.5.0__py3-none-any.whl

sinapsis-speech 0.4.5py3-none-any.whl → 0.5.0py3-none-any.whl