PyPI - cartesia - Versions diffs - 0.0.2__py2.py3-none-any.whl → 0.0.3__py2.py3-none-any.whl - Mend

cartesia-0.0.2-py2.py3-none-any.whl → cartesia-0.0.3-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

cartesia/tts.py +22 -17
cartesia/version.py +1 -1
{cartesia-0.0.2.dist-info → cartesia-0.0.3.dist-info}/METADATA +54 -11
cartesia-0.0.3.dist-info/RECORD +7 -0
cartesia-0.0.2.dist-info/RECORD +0 -7
{cartesia-0.0.2.dist-info → cartesia-0.0.3.dist-info}/WHEEL +0 -0
{cartesia-0.0.2.dist-info → cartesia-0.0.3.dist-info}/top_level.txt +0 -0

cartesia/tts.py CHANGED Viewed

@@ -4,7 +4,6 @@ import os
 import uuid
 from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
-import numpy as np
 import requests
 from websockets.sync.client import connect
@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"
 class AudioOutput(TypedDict):
-    audio: np.ndarray
+    audio: bytes
     sampling_rate: int
@@ -176,13 +175,26 @@ class CartesiaTTS:
     def _is_websocket_closed(self):
         return self.websocket.socket.fileno() == -1
+    def _check_inputs(
+        self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
+    ):
+        if chunk_time is not None:
+            if chunk_time < 0.1 or chunk_time > 0.5:
+                raise ValueError("`chunk_time` must be between 0.1 and 0.5")
+        if chunk_time is not None and duration is not None:
+            if duration < chunk_time:
+                raise ValueError("`duration` must be greater than chunk_time")
+        if transcript.strip() == "":
+            raise ValueError("`transcript` must be non empty")
     def generate(
         self,
         *,
         transcript: str,
         duration: int = None,
         chunk_time: float = None,
-        lookahead: int = None,
         voice: Embedding = None,
         stream: bool = False,
         websocket: bool = True,
@@ -194,8 +206,6 @@ class CartesiaTTS:
             duration: The maximum duration of the audio in seconds.
             chunk_time: How long each audio segment should be in seconds.
                 This should not need to be adjusted.
-            lookahead: The number of seconds to look ahead for each chunk.
-                This should not need to be adjusted.
             voice: The voice to use for generating audio.
                 This can either be a voice id (string) or an embedding vector (List[float]).
             stream: Whether to stream the audio or not.
@@ -206,18 +216,16 @@ class CartesiaTTS:
         Returns:
             A generator if `stream` is True, otherwise a dictionary.
             Dictionary from both generator and non-generator return types have the following keys:
-                * "audio": The audio as a 1D numpy array.
+                * "audio": The audio as a bytes buffer.
                 * "sampling_rate": The sampling rate of the audio.
         """
-        body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
+        self._check_inputs(transcript, duration, chunk_time)
-        if isinstance(voice, str):
-            voice = self._voices[voice]
+        body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
         optional_body = dict(
             duration=duration,
             chunk_time=chunk_time,
-            lookahead=lookahead,
             voice=voice,
         )
         body.update({k: v for k, v in optional_body.items() if v is not None})
@@ -237,7 +245,7 @@ class CartesiaTTS:
                 sampling_rate = chunk["sampling_rate"]
             chunks.append(chunk["audio"])
-        return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}
+        return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
     def _generate_http(self, body: Dict[str, Any]):
         response = requests.post(
@@ -258,8 +266,7 @@ class CartesiaTTS:
                 if start_index != -1 and end_index != -1:
                     try:
                         chunk_json = json.loads(buffer[start_index : end_index + 1])
-                        data = base64.b64decode(chunk_json["data"])
-                        audio = np.frombuffer(data, dtype=np.float32)
+                        audio = base64.b64decode(chunk_json["data"])
                         yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
                         buffer = buffer[end_index + 1 :]
                     except json.JSONDecodeError:
@@ -268,8 +275,7 @@ class CartesiaTTS:
         if buffer:
             try:
                 chunk_json = json.loads(buffer)
-                data = base64.b64decode(chunk_json["data"])
-                audio = np.frombuffer(data, dtype=np.float32)
+                audio = base64.b64decode(chunk_json["data"])
                 yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
             except json.JSONDecodeError:
                 pass
@@ -282,8 +288,7 @@ class CartesiaTTS:
         try:
             response = json.loads(self.websocket.recv())
             while not response["done"]:
-                data = base64.b64decode(response["data"])
-                audio = np.frombuffer(data, dtype=np.float32)
+                audio = base64.b64decode(response["data"])
                 # print("timing", time.perf_counter() - start)
                 yield {"audio": audio, "sampling_rate": response["sampling_rate"]}

cartesia/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.0.2"
1	+ __version__ = "0.0.3"

{cartesia-0.0.2.dist-info → cartesia-0.0.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 0.0.2
+Version: 0.0.3
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
 Description-Content-Type: text/markdown
 Requires-Dist: websockets
 Requires-Dist: requests
-Requires-Dist: numpy
 Provides-Extra: all
 Requires-Dist: pre-commit ; extra == 'all'
 Requires-Dist: docformatter ; extra == 'all'
@@ -49,22 +48,66 @@ pip install -e '.[dev]'
 ## Usage
 ```python
 from cartesia.tts import CartesiaTTS
-from IPython.display import Audio
+import pyaudio
+import os
 client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
 voices = client.get_voices()
-embedding = voices["Milo"]["embedding"]
+voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
 transcript = "Hello! Welcome to Cartesia"
-# No streaming
-output = client.generate(transcript=transcript, voice=embedding)
-Audio(output["audio"], rate=output["sampling_rate"])
+p = pyaudio.PyAudio()
+stream = None
-# Streaming
-for output in client.generate(transcript=transcript, voice=embedding, stream=True):
-    arr = output["audio"]  # a numpy array
+# Generate and stream audio
+for output in client.generate(transcript=transcript, voice=voice, stream=True):
+    buffer = output["audio"]
     rate = output["sampling_rate"]
+    if not stream:
+        stream = p.open(format=pyaudio.paFloat32,
+                        channels=1,
+                        rate=rate,
+                        output=True)
+    # Write the audio data to the stream
+    stream.write(buffer)
+stream.stop_stream()
+stream.close()
+p.terminate()
+```
+If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
+```python
+from cartesia.tts import CartesiaTTS
+from IPython.display import Audio
+import io
+import os
+client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
+voices = client.get_voices()
+voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
+transcript = "Hello! Welcome to Cartesia"
+# Create a BytesIO object to store the audio data
+audio_data = io.BytesIO()
+# Generate and stream audio
+for output in client.generate(transcript=transcript, voice=voice, stream=True):
+    buffer = output["audio"]
+    audio_data.write(buffer)
+# Set the cursor position to the beginning of the BytesIO object
+audio_data.seek(0)
+# Create an Audio object from the BytesIO data
+audio = Audio(audio_data, rate=output["sampling_rate"])
+# Display the Audio object
+display(audio)
 ```
 We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.

cartesia-0.0.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
+cartesia/tts.py,sha256=ABXW9rc8Pn0GTRvb_7DHZKMtbvhGUiqOgHmvztwlOnI,12033
+cartesia/version.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
+cartesia-0.0.3.dist-info/METADATA,sha256=VsCGL1sITbKqERihK2rzVm9WIY5EJ5nCS_CXQ0s14ns,3604
+cartesia-0.0.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+cartesia-0.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-0.0.3.dist-info/RECORD,,

cartesia-0.0.2.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
-cartesia/tts.py,sha256=kQjkQhWfVrLFH6yaOb6G65HXtFDzPwLa6Q9AwVgIyCI,11901
-cartesia/version.py,sha256=QvlVh4JTl3JL7jQAja76yKtT-IvF4631ASjWY1wS6AQ,22
-cartesia-0.0.2.dist-info/METADATA,sha256=7BcDRyB4vxCWxcJhTbe_cWEQXCNOiEJdqoWh8WouNGs,2465
-cartesia-0.0.2.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
-cartesia-0.0.2.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-0.0.2.dist-info/RECORD,,

{cartesia-0.0.2.dist-info → cartesia-0.0.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{cartesia-0.0.2.dist-info → cartesia-0.0.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

cartesia 0.0.2__py2.py3-none-any.whl → 0.0.3__py2.py3-none-any.whl

cartesia-0.0.2-py2.py3-none-any.whl → cartesia-0.0.3-py2.py3-none-any.whl