PyPI - cartesia - Versions diffs - 0.0.3__py2.py3-none-any.whl → 0.0.5rc1__py2.py3-none-any.whl - Mend

cartesia 0.0.3__py2.py3-none-any.whl → 0.0.5rc1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

cartesia/tts.py +256 -40
cartesia/version.py +1 -1
{cartesia-0.0.3.dist-info → cartesia-0.0.5rc1.dist-info}/METADATA +41 -4
cartesia-0.0.5rc1.dist-info/RECORD +7 -0
cartesia-0.0.3.dist-info/RECORD +0 -7
{cartesia-0.0.3.dist-info → cartesia-0.0.5rc1.dist-info}/WHEEL +0 -0
{cartesia-0.0.3.dist-info → cartesia-0.0.5rc1.dist-info}/top_level.txt +0 -0

cartesia/tts.py CHANGED Viewed

@@ -1,15 +1,20 @@
+import asyncio
 import base64
 import json
 import os
 import uuid
-from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
+from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
+import aiohttp
+import httpx
 import requests
 from websockets.sync.client import connect
 DEFAULT_MODEL_ID = "genial-planet-1346"
 DEFAULT_BASE_URL = "api.cartesia.ai"
 DEFAULT_API_VERSION = "v0"
+DEFAULT_TIMEOUT = 60  # seconds
+DEFAULT_NUM_CONNECTIONS = 10  # connections per client
 class AudioOutput(TypedDict):
@@ -27,11 +32,46 @@ class VoiceMetadata(TypedDict):
     embedding: Optional[Embedding]
+def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
+    buffer += chunk_bytes.decode("utf-8")
+    outputs = []
+    while "{" in buffer and "}" in buffer:
+        start_index = buffer.find("{")
+        end_index = buffer.find("}", start_index)
+        if start_index != -1 and end_index != -1:
+            try:
+                chunk_json = json.loads(buffer[start_index : end_index + 1])
+                audio = base64.b64decode(chunk_json["data"])
+                outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
+                buffer = buffer[end_index + 1 :]
+            except json.JSONDecodeError:
+                break
+    return buffer, outputs
+def convert_response(response: Dict[str, any], include_context_id: bool) -> Dict[str, Any]:
+    audio = base64.b64decode(response["data"])
+    optional_kwargs = {}
+    if include_context_id:
+        optional_kwargs["context_id"] = response["context_id"]
+    return {
+        "audio": audio,
+        "sampling_rate": response["sampling_rate"],
+        **optional_kwargs,
+    }
 class CartesiaTTS:
     """The client for Cartesia's text-to-speech library.
     This client contains methods to interact with the Cartesia text-to-speech API.
-    The API offers
+    The client can be used to retrieve available voices, compute new voice embeddings,
+    and generate speech from text.
+    The client also supports generating audio using a websocket for lower latency.
+    To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
     Examples:
@@ -55,18 +95,22 @@ class CartesiaTTS:
         ...     audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
     """
-    def __init__(self, *, api_key: str = None):
+    def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
         """
         Args:
             api_key: The API key to use for authorization.
                 If not specified, the API key will be read from the environment variable
                 `CARTESIA_API_KEY`.
+            experimental_ws_handle_interrupts: Whether to handle interrupts when generating
+                audio using the websocket. This is an experimental feature and may have bugs
+                or be deprecated in the future.
         """
         self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
         self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
         self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
         self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
         self.websocket = None
+        self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
         self.refresh_websocket()
     def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
@@ -100,9 +144,9 @@ class CartesiaTTS:
             >>> audio = client.generate(transcript="Hello world!", voice=embedding)
         """
         params = {"select": "id, name, description"} if skip_embeddings else None
-        response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
+        response = httpx.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
-        if response.status_code != 200:
+        if not response.is_success:
             raise ValueError(f"Failed to get voices. Error: {response.text}")
         voices = response.json()
@@ -134,20 +178,20 @@ class CartesiaTTS:
         if voice_id:
             url = f"{self._http_url()}/voices/embedding/{voice_id}"
-            response = requests.get(url, headers=self.headers)
+            response = httpx.get(url, headers=self.headers)
         elif filepath:
             url = f"{self._http_url()}/voices/clone/clip"
             files = {"clip": open(filepath, "rb")}
             headers = self.headers.copy()
             # The default content type of JSON is incorrect for file uploads
             headers.pop("Content-Type")
-            response = requests.post(url, headers=headers, files=files)
+            response = httpx.post(url, headers=headers, files=files)
         elif link:
             url = f"{self._http_url()}/voices/clone/url"
             params = {"link": link}
-            response = requests.post(url, headers=self.headers, params=params)
+            response = httpx.post(url, headers=self.headers, params=params)
-        if response.status_code != 200:
+        if not response.is_success:
             raise ValueError(
                 f"Failed to clone voice. Status Code: {response.status_code}\n"
                 f"Error: {response.text}"
@@ -167,8 +211,11 @@ class CartesiaTTS:
         """
         if self.websocket and not self._is_websocket_closed():
             self.websocket.close()
+        route = "audio/websocket"
+        if self.experimental_ws_handle_interrupts:
+            route = f"experimental/{route}"
         self.websocket = connect(
-            f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
+            f"{self._ws_url()}/{route}?api_key={self.api_key}",
             close_timeout=None,
         )
@@ -189,6 +236,29 @@ class CartesiaTTS:
         if transcript.strip() == "":
             raise ValueError("`transcript` must be non empty")
+    def _generate_request_body(
+        self,
+        *,
+        transcript: str,
+        duration: int = None,
+        chunk_time: float = None,
+        voice: Embedding = None,
+    ) -> Dict[str, Any]:
+        """
+        Create the request body for a stream request.
+        Note that anything that's not provided will use a default if available or be filtered out otherwise.
+        """
+        body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=voice)
+        optional_body = dict(
+            duration=duration,
+            chunk_time=chunk_time,
+            voice=voice,
+        )
+        body.update({k: v for k, v in optional_body.items() if v is not None})
+        return body
     def generate(
         self,
         *,
@@ -221,14 +291,9 @@ class CartesiaTTS:
         """
         self._check_inputs(transcript, duration, chunk_time)
-        body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
-        optional_body = dict(
-            duration=duration,
-            chunk_time=chunk_time,
-            voice=voice,
+        body = self._generate_request_body(
+            transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
         )
-        body.update({k: v for k, v in optional_body.items() if v is not None})
         if websocket:
             generator = self._generate_ws(body)
@@ -254,23 +319,14 @@ class CartesiaTTS:
             data=json.dumps(body),
             headers=self.headers,
         )
-        if response.status_code != 200:
+        if not response.ok:
             raise ValueError(f"Failed to generate audio. {response.text}")
         buffer = ""
         for chunk_bytes in response.iter_content(chunk_size=None):
-            buffer += chunk_bytes.decode("utf-8")
-            while "{" in buffer and "}" in buffer:
-                start_index = buffer.find("{")
-                end_index = buffer.find("}", start_index)
-                if start_index != -1 and end_index != -1:
-                    try:
-                        chunk_json = json.loads(buffer[start_index : end_index + 1])
-                        audio = base64.b64decode(chunk_json["data"])
-                        yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
-                        buffer = buffer[end_index + 1 :]
-                    except json.JSONDecodeError:
-                        break
+            buffer, outputs = update_buffer(buffer, chunk_bytes)
+            for output in outputs:
+                yield output
         if buffer:
             try:
@@ -280,21 +336,41 @@ class CartesiaTTS:
             except json.JSONDecodeError:
                 pass
-    def _generate_ws(self, body: Dict[str, Any]):
+    def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
+        """Generate audio using the websocket connection.
+        Args:
+            body: The request body.
+            context_id: The context id for the request.
+                The context id must be globally unique for the duration this client exists.
+                If this is provided, the context id that is in the response will
+                also be returned as part of the dict. This is helpful for testing.
+        """
         if not self.websocket or self._is_websocket_closed():
             self.refresh_websocket()
-        self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
+        include_context_id = bool(context_id)
+        if context_id is None:
+            context_id = uuid.uuid4().hex
+        self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
         try:
-            response = json.loads(self.websocket.recv())
-            while not response["done"]:
-                audio = base64.b64decode(response["data"])
-                # print("timing", time.perf_counter() - start)
-                yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
+            while True:
                 response = json.loads(self.websocket.recv())
-        except Exception:
-            raise RuntimeError(f"Failed to generate audio. {response}")
+                if response["done"]:
+                    break
+                yield convert_response(response, include_context_id)
+                if self.experimental_ws_handle_interrupts:
+                    self.websocket.send(json.dumps({"context_id": context_id}))
+        except GeneratorExit:
+            # The exit is only called when the generator is garbage collected.
+            # It may not be called directly after a break statement.
+            # However, the generator will be automatically cancelled on the next request.
+            if self.experimental_ws_handle_interrupts:
+                self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate audio. {response}") from e
     def _http_url(self):
         prefix = "http" if "localhost" in self.base_url else "https"
@@ -307,3 +383,143 @@ class CartesiaTTS:
     def __del__(self):
         if self.websocket.socket.fileno() > -1:
             self.websocket.close()
+class AsyncCartesiaTTS(CartesiaTTS):
+    def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
+        self.timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
+        self.connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
+        self._session = aiohttp.ClientSession(timeout=self.timeout, connector=self.connector)
+        super().__init__(
+            api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
+        )
+    def refresh_websocket(self):
+        pass  # do not load the websocket for the client until asynchronously when it is needed
+    async def _async_refresh_websocket(self):
+        """Refresh the websocket connection."""
+        if self.websocket and not self._is_websocket_closed():
+            self.websocket.close()
+        route = "audio/websocket"
+        if self.experimental_ws_handle_interrupts:
+            route = f"experimental/{route}"
+        self.websocket = await self._session.ws_connect(
+            f"{self._ws_url()}/{route}?api_key={self.api_key}"
+        )
+    async def generate(
+        self,
+        *,
+        transcript: str,
+        duration: int = None,
+        chunk_time: float = None,
+        voice: Embedding = None,
+        stream: bool = False,
+        websocket: bool = True,
+    ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
+        """Asynchronously generate audio from a transcript.
+        NOTE: This overrides the non-asynchronous generate method from the base class.
+        Args:
+            transcript: The text to generate audio for.
+            voice: The embedding to use for generating audio.
+            options: The options to use for generating audio. See :class:`GenerateOptions`.
+        Returns:
+            A dictionary containing the following:
+                * "audio": The audio as a 1D numpy array.
+                * "sampling_rate": The sampling rate of the audio.
+        """
+        body = self._generate_request_body(
+            transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
+        )
+        if websocket:
+            generator = self._generate_ws(body)
+        else:
+            generator = self._generate_http(body)
+        if stream:
+            return generator
+        chunks = []
+        sampling_rate = None
+        async for chunk in generator:
+            if sampling_rate is None:
+                sampling_rate = chunk["sampling_rate"]
+            chunks.append(chunk["audio"])
+        return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
+    async def _generate_http(self, body: Dict[str, Any]):
+        async with self._session.post(
+            f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
+        ) as response:
+            if response.status < 200 or response.status >= 300:
+                raise ValueError(f"Failed to generate audio. {response.text}")
+            buffer = ""
+            async for chunk_bytes in response.content.iter_any():
+                buffer, outputs = update_buffer(buffer, chunk_bytes)
+                for output in outputs:
+                    yield output
+            if buffer:
+                try:
+                    chunk_json = json.loads(buffer)
+                    audio = base64.b64decode(chunk_json["data"])
+                    yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
+                except json.JSONDecodeError:
+                    pass
+    async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
+        include_context_id = bool(context_id)
+        route = "audio/websocket"
+        if self.experimental_ws_handle_interrupts:
+            route = f"experimental/{route}"
+        if not self.websocket or self._is_websocket_closed():
+            await self._async_refresh_websocket()
+        ws = self.websocket
+        if context_id is None:
+            context_id = uuid.uuid4().hex
+        await ws.send_json({"data": body, "context_id": context_id})
+        try:
+            response = None
+            while True:
+                response = await ws.receive_json()
+                if response["done"]:
+                    break
+                yield convert_response(response, include_context_id)
+                if self.experimental_ws_handle_interrupts:
+                    await ws.send_json({"context_id": context_id})
+        except GeneratorExit:
+            # The exit is only called when the generator is garbage collected.
+            # It may not be called directly after a break statement.
+            # However, the generator will be automatically cancelled on the next request.
+            if self.experimental_ws_handle_interrupts:
+                await ws.send_json({"context_id": context_id, "action": "cancel"})
+        except Exception as e:
+            raise RuntimeError(f"Failed to generate audio. {response}") from e
+    def _is_websocket_closed(self):
+        return self.websocket.closed
+    async def cleanup(self):
+        if self.websocket is not None and not self._is_websocket_closed():
+            await self.websocket.close()
+        if not self._session.closed:
+            await self._session.close()
+    def __del__(self):
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+        if loop is None:
+            asyncio.run(self.cleanup())
+        else:
+            loop.create_task(self.cleanup())

cartesia/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.0.3"
1	+ __version__ = "0.0.5rc1"

{cartesia-0.0.3.dist-info → cartesia-0.0.5rc1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 0.0.3
+Version: 0.0.5rc1
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -10,8 +10,11 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.8.0
 Description-Content-Type: text/markdown
-Requires-Dist: websockets
+Requires-Dist: aiohttp
+Requires-Dist: httpx
+Requires-Dist: pytest-asyncio
 Requires-Dist: requests
+Requires-Dist: websockets
 Provides-Extra: all
 Requires-Dist: pre-commit ; extra == 'all'
 Requires-Dist: docformatter ; extra == 'all'
@@ -21,6 +24,7 @@ Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
 Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
 Requires-Dist: pytest >=8.0.2 ; extra == 'all'
 Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
+Requires-Dist: twine ; extra == 'all'
 Provides-Extra: dev
 Requires-Dist: pre-commit ; extra == 'dev'
 Requires-Dist: docformatter ; extra == 'dev'
@@ -30,6 +34,7 @@ Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
 Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
 Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
 Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
+Requires-Dist: twine ; extra == 'dev'
 # Cartesia Python API Library
@@ -104,10 +109,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
 audio_data.seek(0)
 # Create an Audio object from the BytesIO data
-audio = Audio(audio_data, rate=output["sampling_rate"])
+audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
+# Display the Audio object
+display(audio)
+```
+You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
+```python
+from cartesia.tts import AsyncCartesiaTTS
+from IPython.display import Audio
+import io
+import os
+client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
+voices = client.get_voices()
+voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
+transcript = "Hello! Welcome to Cartesia"
+# Create a BytesIO object to store the audio data
+audio_data = io.BytesIO()
+# Generate and stream audio
+async for output in client.generate(transcript=transcript, voice=voice, stream=True):
+    buffer = output["audio"]
+    audio_data.write(buffer)
+# Set the cursor position to the beginning of the BytesIO object
+audio_data.seek(0)
+# Create an Audio object from the BytesIO data
+audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
 # Display the Audio object
 display(audio)
 ```
-We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
+To avoid storing your API key in the source code, we recommend doing one of the following:
+1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
+1. Set the `CARTESIA_API_KEY` environment variable, preferably to a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`)

cartesia-0.0.5rc1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
+cartesia/tts.py,sha256=yPLz41AR0oAYPUNW48mqmwEEbLBHCnbaK_wPT0iFBVk,20543
+cartesia/version.py,sha256=VkI5lk2CFatZR200RqGd8cBjTnMDmhtZW7DI6mPe6n4,25
+cartesia-0.0.5rc1.dist-info/METADATA,sha256=632D6iZ2IU3MLySAnMtwV2zQA38XkQv1rfFF4iRdAco,4893
+cartesia-0.0.5rc1.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+cartesia-0.0.5rc1.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-0.0.5rc1.dist-info/RECORD,,

cartesia-0.0.3.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
-cartesia/tts.py,sha256=ABXW9rc8Pn0GTRvb_7DHZKMtbvhGUiqOgHmvztwlOnI,12033
-cartesia/version.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
-cartesia-0.0.3.dist-info/METADATA,sha256=VsCGL1sITbKqERihK2rzVm9WIY5EJ5nCS_CXQ0s14ns,3604
-cartesia-0.0.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
-cartesia-0.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-0.0.3.dist-info/RECORD,,

{cartesia-0.0.3.dist-info → cartesia-0.0.5rc1.dist-info}/WHEEL RENAMED Viewed

File without changes

{cartesia-0.0.3.dist-info → cartesia-0.0.5rc1.dist-info}/top_level.txt RENAMED Viewed

File without changes

cartesia 0.0.3__py2.py3-none-any.whl → 0.0.5rc1__py2.py3-none-any.whl

cartesia 0.0.3__py2.py3-none-any.whl → 0.0.5rc1__py2.py3-none-any.whl