PyPI - cartesia - Version diff - 1.0.5__py2.py3-none-any.whl → 1.0.6__py2.py3-none-any.whl - Mend

cartesia 1.0.5 (py2.py3-none-any.whl) → 1.0.6 (py2.py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

cartesia/_types.py +24 -0
cartesia/client.py +93 -73
cartesia/version.py +1 -1
{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/METADATA +1 -1
cartesia-1.0.6.dist-info/RECORD +12 -0
cartesia-1.0.5.dist-info/RECORD +0 -12
{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/LICENSE.md +0 -0
{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/WHEEL +0 -0
{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/top_level.txt +0 -0

cartesia/_types.py CHANGED Viewed

@@ -70,7 +70,31 @@ class VoiceMetadata(TypedDict):
     language: str
+class VoiceControls(TypedDict):
+    """Defines different voice control parameters for voice synthesis.
+    For a complete list of supported parameters, refer to the Cartesia API documentation.
+    https://docs.cartesia.ai/getting-started/welcome
+    Examples:
+        >>> {"speed": "fastest"}
+        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
+        >>> {"emotion": "surprise:high, positivity:high"}
+    Note:
+        This is an experimental class and is subject to rapid change in future versions.
+    """
+    speed: str = ""
+    emotion: str = ""
 class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+class EventType:
+    NULL = ""
+    AUDIO = "chunk"
+    TIMESTAMPS = "timestamps"

cartesia/client.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import asyncio
 import base64
+from collections import defaultdict
 import json
 import os
 import uuid
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator
 from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
 from cartesia._types import (
+    EventType,
     OutputFormat,
     OutputFormatMapping,
     DeprecatedOutputFormatMapping,
+    VoiceControls,
     VoiceMetadata,
 )
@@ -295,6 +298,7 @@ class _TTSContext:
         context_id: Optional[str] = None,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Generator[bytes, None, None]:
         """Send audio generation requests to the WebSocket and yield responses.
@@ -307,6 +311,8 @@ class _TTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Yields:
             Dictionary containing the following key(s):
@@ -322,7 +328,7 @@ class _TTSContext:
         self._websocket.connect()
-        voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
         # Create the initial request body
         request_body = {
@@ -482,42 +488,16 @@ class _WebSocket:
     def _convert_response(
         self, response: Dict[str, any], include_context_id: bool
     ) -> Dict[str, Any]:
-        audio = base64.b64decode(response["data"])
-        optional_kwargs = {}
+        out = {}
+        if response["type"] == EventType.AUDIO:
+            out["audio"] = base64.b64decode(response["data"])
+        elif response["type"] == EventType.TIMESTAMPS:
+            out["word_timestamps"] = response["word_timestamps"]
         if include_context_id:
-            optional_kwargs["context_id"] = response["context_id"]
-        return {
-            "audio": audio,
-            **optional_kwargs,
-        }
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-        Returns:
-            A dictionary representing the voice configuration.
+            out["context_id"] = response["context_id"]
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-        return {"mode": "embedding", "embedding": voice_embedding}
+        return out
     def send(
         self,
@@ -530,6 +510,8 @@ class _WebSocket:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the WebSocket to generate audio.
@@ -543,6 +525,9 @@ class _WebSocket:
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -556,7 +541,7 @@ class _WebSocket:
         if context_id is None:
             context_id = str(uuid.uuid4())
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
         request_body = {
             "model_id": model_id,
@@ -569,6 +554,7 @@ class _WebSocket:
             },
             "context_id": context_id,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
         if duration is not None:
@@ -580,10 +566,17 @@ class _WebSocket:
             return generator
         chunks = []
+        word_timestamps = defaultdict(list)
         for chunk in generator:
-            chunks.append(chunk["audio"])
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
     def _websocket_generator(self, request_body: Dict[str, Any]):
         self.websocket.send(json.dumps(request_body))
@@ -656,32 +649,6 @@ class _SSE:
                     break
         return buffer, outputs
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-        Returns:
-            A dictionary representing the voice configuration.
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-        return {"mode": "embedding", "embedding": voice_embedding}
     def send(
         self,
         model_id: str,
@@ -692,6 +659,7 @@ class _SSE:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the server to generate audio using Server-Sent Events.
@@ -704,6 +672,8 @@ class _SSE:
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -711,8 +681,7 @@ class _SSE:
             Both the generator and the dictionary contain the following key(s):
             - audio: The audio as bytes.
         """
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
@@ -946,8 +915,9 @@ class _AsyncSSE(_SSE):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)
         request_body = {
             "model_id": model_id,
@@ -1043,6 +1013,8 @@ class _AsyncTTSContext:
         continue_: bool = False,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> None:
         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
@@ -1055,7 +1027,10 @@ class _AsyncTTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             continue_: Whether to continue the audio generation from the previous transcript or not.
             duration: The duration of the audio in seconds.
-            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Returns:
             None.
@@ -1067,7 +1042,7 @@ class _AsyncTTSContext:
         await self._websocket.connect()
-        voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
         request_body = {
             "model_id": model_id,
@@ -1081,6 +1056,7 @@ class _AsyncTTSContext:
             "context_id": self._context_id,
             "continue": continue_,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
         if duration is not None:
@@ -1234,7 +1210,10 @@ class _AsyncWebSocket(_WebSocket):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
             context_id = str(uuid.uuid4())
@@ -1250,6 +1229,8 @@ class _AsyncWebSocket(_WebSocket):
             duration=duration,
             language=language,
             continue_=False,
+            add_timestamps = add_timestamps,
+            _experimental_voice_controls=_experimental_voice_controls,
         )
         generator = ctx.receive()
@@ -1258,10 +1239,17 @@ class _AsyncWebSocket(_WebSocket):
             return generator
         chunks = []
+        word_timestamps = defaultdict(list)
         async for chunk in generator:
-            chunks.append(chunk["audio"])
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
     async def _process_responses(self):
         try:
@@ -1311,3 +1299,35 @@ class AsyncTTS(TTS):
         )
         await ws.connect()
         return ws
+def _validate_and_construct_voice(
+    voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
+) -> dict:
+    """Validate and construct the voice dictionary for the request.
+    Args:
+        voice_id: The ID of the voice to use for generating audio.
+        voice_embedding: The embedding of the voice to use for generating audio.
+        experimental_voice_controls: Voice controls for emotion and speed.
+            Note: This is an experimental feature and may rapidly change in the future.
+    Returns:
+        A dictionary representing the voice configuration.
+    Raises:
+        ValueError: If neither or both voice_id and voice_embedding are specified.
+    """
+    if voice_id is None and voice_embedding is None:
+        raise ValueError("Either voice_id or voice_embedding must be specified.")
+    if voice_id is not None and voice_embedding is not None:
+        raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+    if voice_id:
+        voice = {"mode": "id", "id": voice_id}
+    else:
+        voice = {"mode": "embedding", "embedding": voice_embedding}
+    if experimental_voice_controls is not None:
+        voice["__experimental_controls"] = experimental_voice_controls
+    return voice

cartesia/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "1.0.5"
+__version__ = "1.0.6"

{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.5
+Version: 1.0.6
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.

cartesia-1.0.6.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
+cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
+cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
+cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
+cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
+cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
+cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
+cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
+cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-1.0.6.dist-info/RECORD,,

cartesia-1.0.5.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
-cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
-cartesia/client.py,sha256=46XiKTXa0gBXJ_GftMtLHAzBoX0GmWz_aWYuG68jaNQ,49316
-cartesia/version.py,sha256=B9kKWJLln1i8LjtkcYecvNWGLTrez4gCUOHtnPlInFo,22
-cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
-cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
-cartesia-1.0.5.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
-cartesia-1.0.5.dist-info/METADATA,sha256=PImHYCNoo7iSnm3Br6PuRdqvli92c7AyXR4iagdv-d8,18368
-cartesia-1.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
-cartesia-1.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-1.0.5.dist-info/RECORD,,

{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

cartesia 1.0.5__py2.py3-none-any.whl → 1.0.6__py2.py3-none-any.whl

cartesia 1.0.5 (py2.py3-none-any.whl) → 1.0.6 (py2.py3-none-any.whl)