PyPI - dv-pipecat-ai - Versions diffs - 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev832__py3-none-any.whl - Mend

dv-pipecat-ai 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev832__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.85.dev830
+Version: 0.0.85.dev832
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat

{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev830.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev832.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -205,7 +205,7 @@ pipecat/services/azure/realtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 pipecat/services/azure/realtime/llm.py,sha256=MnDiw-YJP3kll1gbkta4z4vsWfWZ5oBprZCinMP9O0M,2385
 pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
 pipecat/services/cartesia/stt.py,sha256=00k9gQYo_xPKb-RRJ-RNV4LPFw-7xXiFU7ACFLYttWY,12388
-pipecat/services/cartesia/tts.py,sha256=EdpVJoDhZn7N5hj-VDsCaO-W2MsA78UzOdrHR4G7w08,24355
+pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_obtc,27008
 pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
 pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
 pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
@@ -415,7 +415,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev830.dist-info/METADATA,sha256=wPJAPffJo_L5wKNWKbIxlaBG09JAGKUTFl_qkLwmoPw,32924
-dv_pipecat_ai-0.0.85.dev830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev830.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev830.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev832.dist-info/METADATA,sha256=LjkA2HTlz8IiiToSkqkqztGsCqkbhgEdL6B0BXdOOLA,32924
+dv_pipecat_ai-0.0.85.dev832.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev832.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev832.dist-info/RECORD,,

pipecat/services/cartesia/tts.py CHANGED Viewed

@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
 from loguru import logger
 from pydantic import BaseModel, Field
 from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
+class GenerationConfig(BaseModel):
+    """Configuration for Cartesia Sonic-3 generation parameters.
+    Sonic-3 interprets these parameters as guidance to ensure natural speech.
+    Test against your content for best results.
+    Parameters:
+        volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
+        speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
+        emotion: Single emotion string to guide the emotional tone. Examples include neutral,
+            angry, excited, content, sad, scared. Over 60 emotions are supported. For best
+            results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
+            and Marian.
+    """
+    volume: Optional[float] = None
+    speed: Optional[float] = None
+    emotion: Optional[str] = None
 def language_to_cartesia_language(language: Language) -> Optional[str]:
     """Convert a Language enum to Cartesia language code.
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
         Language.SV: "sv",
         Language.TR: "tr",
         Language.ZH: "zh",
+        Language.TL: "tl",
+        Language.BG: "bg",
+        Language.RO: "ro",
+        Language.AR: "ar",
+        Language.CS: "cs",
+        Language.EL: "el",
+        Language.FI: "fi",
+        Language.HR: "hr",
+        Language.MS: "ms",
+        Language.SK: "sk",
+        Language.DA: "da",
+        Language.TA: "ta",
+        Language.UK: "uk",
+        Language.HU: "hu",
+        Language.NO: "no",
+        Language.VI: "vi",
+        Language.BN: "bn",
+        Language.TH: "th",
+        Language.HE: "he",
+        Language.KA: "ka",
+        Language.ID: "id",
+        Language.TE: "te",
+        Language.GU: "gu",
+        Language.KN: "kn",
+        Language.ML: "ml",
+        Language.MR: "mr",
+        Language.PA: "pa",
     }
     result = BASE_LANGUAGES.get(language)
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
         Parameters:
             language: Language to use for synthesis.
-            speed: Voice speed control.
-            emotion: List of emotion controls.
+            speed: Voice speed control for non-Sonic-3 models (literal values).
+            emotion: List of emotion controls for non-Sonic-3 models.
                 .. deprecated:: 0.0.68
                         The `emotion` parameter is deprecated and will be removed in a future version.
+            generation_config: Generation configuration for Sonic-3 models. Includes volume,
+                speed (numeric), and emotion (string) parameters.
         """
         language: Optional[Language] = Language.EN
         speed: Optional[Literal["slow", "normal", "fast"]] = None
         emotion: Optional[List[str]] = []
+        generation_config: Optional[GenerationConfig] = None
     def __init__(
         self,
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         voice_id: str,
         cartesia_version: str = "2025-04-16",
         url: str = "wss://api.cartesia.ai/tts/websocket",
-        model: str = "sonic-2",
+        model: str = "sonic-3",
         sample_rate: Optional[int] = None,
         encoding: str = "pcm_s16le",
         container: str = "raw",
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             voice_id: ID of the voice to use for synthesis.
             cartesia_version: API version string for Cartesia service.
             url: WebSocket URL for Cartesia TTS API.
-            model: TTS model to use (e.g., "sonic-2").
+            model: TTS model to use (e.g., "sonic-3").
             sample_rate: Audio sample rate. If None, uses default.
             encoding: Audio encoding format.
             container: Audio container format.
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             else "en",
             "speed": params.speed,
             "emotion": params.emotion,
+            "generation_config": params.generation_config,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
         if self._settings["speed"]:
             msg["speed"] = self._settings["speed"]
+        if self._settings["generation_config"]:
+            msg["generation_config"] = self._settings["generation_config"].model_dump(
+                exclude_none=True
+            )
         return json.dumps(msg)
     async def start(self, frame: StartFrame):
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
                 logger.error(f"{self} error: {msg}")
                 await self.push_frame(TTSStoppedFrame())
                 await self.stop_all_metrics()
                 await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
                 self._context_id = None
             else:
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
         Parameters:
             language: Language to use for synthesis.
-            speed: Voice speed control.
-            emotion: List of emotion controls.
+            speed: Voice speed control for non-Sonic-3 models (literal values).
+            emotion: List of emotion controls for non-Sonic-3 models.
                 .. deprecated:: 0.0.68
                         The `emotion` parameter is deprecated and will be removed in a future version.
+            generation_config: Generation configuration for Sonic-3 models. Includes volume,
+                speed (numeric), and emotion (string) parameters.
         """
         language: Optional[Language] = Language.EN
         speed: Optional[Literal["slow", "normal", "fast"]] = None
         emotion: Optional[List[str]] = Field(default_factory=list)
+        generation_config: Optional[GenerationConfig] = None
     def __init__(
         self,
         *,
         api_key: str,
         voice_id: str,
-        model: str = "sonic-2",
+        model: str = "sonic-3",
         base_url: str = "https://api.cartesia.ai",
         cartesia_version: str = "2024-11-13",
         sample_rate: Optional[int] = None,
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
         Args:
             api_key: Cartesia API key for authentication.
             voice_id: ID of the voice to use for synthesis.
-            model: TTS model to use (e.g., "sonic-2").
+            model: TTS model to use (e.g., "sonic-3").
             base_url: Base URL for Cartesia HTTP API.
             cartesia_version: API version string for Cartesia service.
             sample_rate: Audio sample rate. If None, uses default.
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
             else "en",
             "speed": params.speed,
             "emotion": params.emotion,
+            "generation_config": params.generation_config,
         }
         self.set_voice(voice_id)
         self.set_model_name(model)
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
             if self._settings["speed"]:
                 payload["speed"] = self._settings["speed"]
+            if self._settings["generation_config"]:
+                payload["generation_config"] = self._settings["generation_config"].model_dump(
+                    exclude_none=True
+                )
             yield TTSStartedFrame()
             session = await self._client._get_session()

{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/WHEEL RENAMED Viewed

File without changes

{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{dv_pipecat_ai-0.0.85.dev830.dist-info → dv_pipecat_ai-0.0.85.dev832.dist-info}/top_level.txt RENAMED Viewed

File without changes

dv-pipecat-ai 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev832__py3-none-any.whl

dv-pipecat-ai 0.0.85.dev830__py3-none-any.whl → 0.0.85.dev832__py3-none-any.whl