cartesia-1.0.6-py2.py3-none-any.whl → cartesia-1.0.7-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/_types.py CHANGED
@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
         "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
         "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
     }
-
+
     @classmethod
     @deprecated(
         vdeprecated="1.0.1",
@@ -74,18 +74,19 @@ class VoiceControls(TypedDict):
     """Defines different voice control parameters for voice synthesis.
 
     For a complete list of supported parameters, refer to the Cartesia API documentation.
-    https://docs.cartesia.ai/getting-started/welcome
+    https://docs.cartesia.ai/api-reference
 
     Examples:
         >>> {"speed": "fastest"}
-        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
-        >>> {"emotion": "surprise:high, positivity:high"}
+        >>> {"speed": "slow", "emotion": ["sadness:high"]}
+        >>> {"emotion": ["surprise:highest", "curiosity"]}
 
     Note:
         This is an experimental class and is subject to rapid change in future versions.
     """
+
     speed: str = ""
-    emotion: str = ""
+    emotion: List[str] = []
 
 
 class OutputFormat(TypedDict):
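
To make the `emotion` change above concrete: 1.0.6 accepted a comma-separated string, while 1.0.7 expects a list of `emotion_name:level` tags. A minimal sketch under the new schema (the import path comes from this diff; the values are illustrative only):

```python
from cartesia._types import VoiceControls

# 1.0.7 style: `emotion` is a list of "emotion_name:level" tags.
# Omitting ":level" (as in "curiosity") implies medium intensity.
controls: VoiceControls = {
    "speed": "slow",
    "emotion": ["sadness:high", "curiosity"],
}
```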
cartesia/client.py CHANGED
@@ -328,7 +328,11 @@ class _TTSContext:
 
         self._websocket.connect()
 
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         # Create the initial request body
         request_body = {
@@ -493,7 +497,7 @@ class _WebSocket:
             out["audio"] = base64.b64decode(response["data"])
         elif response["type"] == EventType.TIMESTAMPS:
             out["word_timestamps"] = response["word_timestamps"]
-
+
         if include_context_id:
             out["context_id"] = response["context_id"]
 
@@ -541,7 +545,11 @@ class _WebSocket:
         if context_id is None:
             context_id = str(uuid.uuid4())
 
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         request_body = {
             "model_id": model_id,
@@ -681,7 +689,11 @@ class _SSE:
         Both the generator and the dictionary contain the following key(s):
         - audio: The audio as bytes.
         """
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
@@ -795,6 +807,7 @@ class TTS(Resource):
             sample_rate=output_format_obj["sample_rate"],
         )
 
+    @staticmethod
     def get_sample_rate(self, output_format_name: str) -> int:
         """Convenience method to get the sample rate for a given output format.
 
@@ -818,6 +831,40 @@ class TTS(Resource):
 
         return output_format_obj["sample_rate"]
 
+    @staticmethod
+    def _validate_and_construct_voice(
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> dict:
+        """Validate and construct the voice dictionary for the request.
+
+        Args:
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            experimental_voice_controls: Voice controls for emotion and speed.
+                Note: This is an experimental feature and may rapidly change in the future.
+
+        Returns:
+            A dictionary representing the voice configuration.
+
+        Raises:
+            ValueError: If neither or both voice_id and voice_embedding are specified.
+        """
+        if voice_id is None and voice_embedding is None:
+            raise ValueError("Either voice_id or voice_embedding must be specified.")
+
+        if voice_id is not None and voice_embedding is not None:
+            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+
+        if voice_id:
+            voice = {"mode": "id", "id": voice_id}
+        else:
+            voice = {"mode": "embedding", "embedding": voice_embedding}
+        if experimental_voice_controls is not None:
+            voice["__experimental_controls"] = experimental_voice_controls
+        return voice
+
 
 class AsyncCartesia(Cartesia):
     """The asynchronous version of the Cartesia client."""
@@ -917,7 +964,11 @@ class _AsyncSSE(_SSE):
         stream: bool = True,
         _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         request_body = {
             "model_id": model_id,
@@ -1042,7 +1093,9 @@ class _AsyncTTSContext:
 
         await self._websocket.connect()
 
-        voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
+        )
 
         request_body = {
             "model_id": model_id,
@@ -1229,7 +1282,7 @@ class _AsyncWebSocket(_WebSocket):
             duration=duration,
             language=language,
             continue_=False,
-            add_timestamps = add_timestamps,
+            add_timestamps=add_timestamps,
             _experimental_voice_controls=_experimental_voice_controls,
         )
 
@@ -1299,35 +1352,3 @@ class AsyncTTS(TTS):
         )
         await ws.connect()
         return ws
-
-
-def _validate_and_construct_voice(
-    voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
-) -> dict:
-    """Validate and construct the voice dictionary for the request.
-
-    Args:
-        voice_id: The ID of the voice to use for generating audio.
-        voice_embedding: The embedding of the voice to use for generating audio.
-        experimental_voice_controls: Voice controls for emotion and speed.
-            Note: This is an experimental feature and may rapidly change in the future.
-
-    Returns:
-        A dictionary representing the voice configuration.
-
-    Raises:
-        ValueError: If neither or both voice_id and voice_embedding are specified.
-    """
-    if voice_id is None and voice_embedding is None:
-        raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-    if voice_id is not None and voice_embedding is not None:
-        raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-    if voice_id:
-        voice = {"mode": "id", "id": voice_id}
-    else:
-        voice = {"mode": "embedding", "embedding": voice_embedding}
-    if experimental_voice_controls is not None:
-        voice["__experimental_controls"] = experimental_voice_controls
-    return voice
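
The refactor above moves the module-level `_validate_and_construct_voice` helper onto `TTS` as a static method. A minimal sketch of its behavior, based solely on the implementation shown in this diff (note it is an internal, underscore-prefixed helper, not public API; the voice ID below is hypothetical):

```python
from cartesia.client import TTS

# Exactly one of voice_id / voice_embedding may be given.
voice = TTS._validate_and_construct_voice(voice_id="some-voice-id")
assert voice == {"mode": "id", "id": "some-voice-id"}

# Voice controls, when provided, ride along under "__experimental_controls".
voice = TTS._validate_and_construct_voice(
    voice_id="some-voice-id",
    experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
)
assert voice["__experimental_controls"] == {"speed": "fast", "emotion": ["positivity:high"]}

# Passing neither input (or both) raises ValueError.
```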
cartesia/version.py CHANGED
@@ -1 +1 @@
-__version__ = "1.0.6"
+__version__ = "1.0.7"
cartesia-1.0.6.dist-info/METADATA → cartesia-1.0.7.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.6
+Version: 1.0.7
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -419,6 +419,34 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
 
+### Generating timestamps using WebSocket
+
+The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+- words (list): The individual words in the transcript.
+- start (list): The starting timestamp for each word (in seconds).
+- end (list): The ending timestamp for each word (in seconds).
+
+```python
+response = ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    stream=False,
+    add_timestamps=True,
+)
+
+# Accessing the word_timestamps object
+word_timestamps = response['word_timestamps']
+
+words = word_timestamps['words']
+start_times = word_timestamps['start']
+end_times = word_timestamps['end']
+
+for word, start, end in zip(words, start_times, end_times):
+    print(f"Word: {word}, Start: {start}, End: {end}")
+```
+
 ### Multilingual Text-to-Speech [Alpha]
 
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -472,6 +500,31 @@ stream.close()
 p.terminate()
 ```
 
+### Speed and Emotion Control [Experimental]
+
+You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass an `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+Speed Options:
+- `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+Emotion Options:
+Use a list of tags in the format `emotion_name:level`, where:
+- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+- Levels: `lowest`, `low`, (omit for medium), `high`, `highest`
+Each tag adds the specified emotion to the voice at the indicated intensity; omitting the level tag yields a medium intensity.
+
+```python
+ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+)
+```
+
+### Jupyter Notebook Usage
+
 If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
 Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
 
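Regarding the Jupyter playback note in the README section above, a minimal sketch, assuming `audio_bytes` holds raw little-endian float32 PCM (one of the client's `fp32` output formats) and `rate` matches the requested sample rate (both names are placeholders from your own generation call):

```python
import numpy as np
from IPython.display import Audio

# Interpret the raw float32 PCM bytes as samples, then render an
# inline audio player in the notebook.
samples = np.frombuffer(audio_bytes, dtype=np.float32)
Audio(samples, rate=rate)
```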
cartesia-1.0.7.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
+cartesia/_types.py,sha256=Lcp4GOot5UfI0EveDi2QdNALMo1rK4PwUrtMvW5P6vY,4406
+cartesia/client.py,sha256=1T_HboqHZO6wjUDYpuWI7igV-QF_cRL4DY7v4NDzApo,51871
+cartesia/version.py,sha256=BW7SWRpHoxuOQZ67pS20yog2LWYl-nK7-BEFBNrHGgA,22
+cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
+cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
+cartesia-1.0.7.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
+cartesia-1.0.7.dist-info/METADATA,sha256=vvU7-K0raiw4hmotlST5wi6uSnGiXjMpHxd2CIzvbMc,20336
+cartesia-1.0.7.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+cartesia-1.0.7.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-1.0.7.dist-info/RECORD,,
cartesia-1.0.6.dist-info/RECORD REMOVED
@@ -1,12 +0,0 @@
-cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
-cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
-cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
-cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
-cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
-cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
-cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
-cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
-cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
-cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-1.0.6.dist-info/RECORD,,