PyPI - cartesia - Versions diffs - 1.0.5__py2.py3-none-any.whl → 1.0.7__py2.py3-none-any.whl - Mend

cartesia-1.0.5-py2.py3-none-any.whl → cartesia-1.0.7-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

cartesia/_types.py +26 -1
cartesia/client.py +113 -72
cartesia/version.py +1 -1
{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/METADATA +54 -1
cartesia-1.0.7.dist-info/RECORD +12 -0
cartesia-1.0.5.dist-info/RECORD +0 -12
{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/LICENSE.md +0 -0
{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/WHEEL +0 -0
{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/top_level.txt +0 -0

cartesia/_types.py CHANGED Viewed

@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
         "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
         "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
     }
     @classmethod
     @deprecated(
         vdeprecated="1.0.1",
@@ -70,7 +70,32 @@ class VoiceMetadata(TypedDict):
     language: str
+class VoiceControls(TypedDict):
+    """Defines different voice control parameters for voice synthesis.
+    For a complete list of supported parameters, refer to the Cartesia API documentation.
+    https://docs.cartesia.ai/api-reference
+    Examples:
+        >>> {"speed": "fastest"}
+        >>> {"speed": "slow", "emotion": ["sadness:high"]}
+        >>> {"emotion": ["surprise:highest", "curiosity"]}
+    Note:
+        This is an experimental class and is subject to rapid change in future versions.
+    """
+    speed: str = ""
+    emotion: List[str] = []
 class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+class EventType:
+    NULL = ""
+    AUDIO = "chunk"
+    TIMESTAMPS = "timestamps"

cartesia/client.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import asyncio
 import base64
+from collections import defaultdict
 import json
 import os
 import uuid
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator
 from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
 from cartesia._types import (
+    EventType,
     OutputFormat,
     OutputFormatMapping,
     DeprecatedOutputFormatMapping,
+    VoiceControls,
     VoiceMetadata,
 )
@@ -295,6 +298,7 @@ class _TTSContext:
         context_id: Optional[str] = None,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Generator[bytes, None, None]:
         """Send audio generation requests to the WebSocket and yield responses.
@@ -307,6 +311,8 @@ class _TTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Yields:
             Dictionary containing the following key(s):
@@ -322,7 +328,11 @@ class _TTSContext:
         self._websocket.connect()
-        voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         # Create the initial request body
         request_body = {
@@ -482,42 +492,16 @@ class _WebSocket:
     def _convert_response(
         self, response: Dict[str, any], include_context_id: bool
     ) -> Dict[str, Any]:
-        audio = base64.b64decode(response["data"])
+        out = {}
+        if response["type"] == EventType.AUDIO:
+            out["audio"] = base64.b64decode(response["data"])
+        elif response["type"] == EventType.TIMESTAMPS:
+            out["word_timestamps"] = response["word_timestamps"]
-        optional_kwargs = {}
         if include_context_id:
-            optional_kwargs["context_id"] = response["context_id"]
-        return {
-            "audio": audio,
-            **optional_kwargs,
-        }
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
+            out["context_id"] = response["context_id"]
-        Returns:
-            A dictionary representing the voice configuration.
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-        return {"mode": "embedding", "embedding": voice_embedding}
+        return out
     def send(
         self,
@@ -530,6 +514,8 @@ class _WebSocket:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the WebSocket to generate audio.
@@ -543,6 +529,9 @@ class _WebSocket:
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -556,7 +545,11 @@ class _WebSocket:
         if context_id is None:
             context_id = str(uuid.uuid4())
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         request_body = {
             "model_id": model_id,
@@ -569,6 +562,7 @@ class _WebSocket:
             },
             "context_id": context_id,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
         if duration is not None:
@@ -580,10 +574,17 @@ class _WebSocket:
             return generator
         chunks = []
+        word_timestamps = defaultdict(list)
         for chunk in generator:
-            chunks.append(chunk["audio"])
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
     def _websocket_generator(self, request_body: Dict[str, Any]):
         self.websocket.send(json.dumps(request_body))
@@ -656,32 +657,6 @@ class _SSE:
                     break
         return buffer, outputs
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-        Returns:
-            A dictionary representing the voice configuration.
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-        return {"mode": "embedding", "embedding": voice_embedding}
     def send(
         self,
         model_id: str,
@@ -692,6 +667,7 @@ class _SSE:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the server to generate audio using Server-Sent Events.
@@ -704,6 +680,8 @@ class _SSE:
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -711,8 +689,11 @@ class _SSE:
             Both the generator and the dictionary contain the following key(s):
             - audio: The audio as bytes.
         """
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
@@ -826,6 +807,7 @@ class TTS(Resource):
             sample_rate=output_format_obj["sample_rate"],
         )
+    @staticmethod
     def get_sample_rate(self, output_format_name: str) -> int:
         """Convenience method to get the sample rate for a given output format.
@@ -849,6 +831,40 @@ class TTS(Resource):
         return output_format_obj["sample_rate"]
+    @staticmethod
+    def _validate_and_construct_voice(
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> dict:
+        """Validate and construct the voice dictionary for the request.
+        Args:
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            experimental_voice_controls: Voice controls for emotion and speed.
+                Note: This is an experimental feature and may rapidly change in the future.
+        Returns:
+            A dictionary representing the voice configuration.
+        Raises:
+            ValueError: If neither or both voice_id and voice_embedding are specified.
+        """
+        if voice_id is None and voice_embedding is None:
+            raise ValueError("Either voice_id or voice_embedding must be specified.")
+        if voice_id is not None and voice_embedding is not None:
+            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+        if voice_id:
+            voice = {"mode": "id", "id": voice_id}
+        else:
+            voice = {"mode": "embedding", "embedding": voice_embedding}
+        if experimental_voice_controls is not None:
+            voice["__experimental_controls"] = experimental_voice_controls
+        return voice
 class AsyncCartesia(Cartesia):
     """The asynchronous version of the Cartesia client."""
@@ -946,8 +962,13 @@ class _AsyncSSE(_SSE):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         request_body = {
             "model_id": model_id,
@@ -1043,6 +1064,8 @@ class _AsyncTTSContext:
         continue_: bool = False,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> None:
         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
@@ -1055,7 +1078,10 @@ class _AsyncTTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             continue_: Whether to continue the audio generation from the previous transcript or not.
             duration: The duration of the audio in seconds.
-            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
         Returns:
             None.
@@ -1067,7 +1093,9 @@ class _AsyncTTSContext:
         await self._websocket.connect()
-        voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+        voice = TTS._validate_and_construct_voice(
+            voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
+        )
         request_body = {
             "model_id": model_id,
@@ -1081,6 +1109,7 @@ class _AsyncTTSContext:
             "context_id": self._context_id,
             "continue": continue_,
             "language": language,
+            "add_timestamps": add_timestamps,
         }
         if duration is not None:
@@ -1234,7 +1263,10 @@ class _AsyncWebSocket(_WebSocket):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
             context_id = str(uuid.uuid4())
@@ -1250,6 +1282,8 @@ class _AsyncWebSocket(_WebSocket):
             duration=duration,
             language=language,
             continue_=False,
+            add_timestamps=add_timestamps,
+            _experimental_voice_controls=_experimental_voice_controls,
         )
         generator = ctx.receive()
@@ -1258,10 +1292,17 @@ class _AsyncWebSocket(_WebSocket):
             return generator
         chunks = []
+        word_timestamps = defaultdict(list)
         async for chunk in generator:
-            chunks.append(chunk["audio"])
-        return {"audio": b"".join(chunks), "context_id": context_id}
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
     async def _process_responses(self):
         try:

cartesia/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "1.0.5"
+__version__ = "1.0.7"

{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.5
+Version: 1.0.7
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -419,6 +419,34 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
+### Generating timestamps using WebSocket
+The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+- words (list): The individual words in the transcript.
+- start (list): The starting timestamp for each word (in seconds).
+- end (list): The ending timestamp for each word (in seconds).
+```python
+response = ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    stream=False,
+    add_timestamps=True
+)
+# Accessing the word_timestamps object
+word_timestamps = response['word_timestamps']
+words = word_timestamps['words']
+start_times = word_timestamps['start']
+end_times = word_timestamps['end']
+for word, start, end in zip(words, start_times, end_times):
+    print(f"Word: {word}, Start: {start}, End: {end}")
+```
 ### Multilingual Text-to-Speech [Alpha]
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -472,6 +500,31 @@ stream.close()
 p.terminate()
 ```
+### Speed and Emotion Control [Experimental]
+You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+Speed Options:
+- `slowest`, `slow`, `normal`, `fast`, `fastest`
+Emotion Options:
+Use a list of tags in the format `emotion_name:level` where:
+- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+- Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
+The emotion tag levels add the specified emotion to the voice at the indicated intensity, with the omission of a level tag resulting in a medium intensity.
+```python
+ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+)
+```
+### Jupyter Notebook Usage
 If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
 Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).

cartesia-1.0.7.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
+cartesia/_types.py,sha256=Lcp4GOot5UfI0EveDi2QdNALMo1rK4PwUrtMvW5P6vY,4406
+cartesia/client.py,sha256=1T_HboqHZO6wjUDYpuWI7igV-QF_cRL4DY7v4NDzApo,51871
+cartesia/version.py,sha256=BW7SWRpHoxuOQZ67pS20yog2LWYl-nK7-BEFBNrHGgA,22
+cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
+cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
+cartesia-1.0.7.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
+cartesia-1.0.7.dist-info/METADATA,sha256=vvU7-K0raiw4hmotlST5wi6uSnGiXjMpHxd2CIzvbMc,20336
+cartesia-1.0.7.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+cartesia-1.0.7.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-1.0.7.dist-info/RECORD,,

cartesia-1.0.5.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
-cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
-cartesia/client.py,sha256=46XiKTXa0gBXJ_GftMtLHAzBoX0GmWz_aWYuG68jaNQ,49316
-cartesia/version.py,sha256=B9kKWJLln1i8LjtkcYecvNWGLTrez4gCUOHtnPlInFo,22
-cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
-cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
-cartesia-1.0.5.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
-cartesia-1.0.5.dist-info/METADATA,sha256=PImHYCNoo7iSnm3Br6PuRdqvli92c7AyXR4iagdv-d8,18368
-cartesia-1.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
-cartesia-1.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-1.0.5.dist-info/RECORD,,

{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

cartesia 1.0.5__py2.py3-none-any.whl → 1.0.7__py2.py3-none-any.whl

cartesia-1.0.5-py2.py3-none-any.whl → cartesia-1.0.7-py2.py3-none-any.whl