cartesia 0.0.5rc1__py2.py3-none-any.whl → 0.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/__init__.py CHANGED
@@ -1,3 +1,3 @@
- from cartesia.tts import CartesiaTTS
+ from cartesia.tts import AsyncCartesiaTTS, CartesiaTTS
 
- __all__ = ["CartesiaTTS"]
+ __all__ = ["CartesiaTTS", "AsyncCartesiaTTS"]
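A quick illustrative note (not part of the released diff): with both clients re-exported in `__all__`, they can now be imported straight from the package root, as in the sketch below.

```python
# Both clients are now importable from the package root.
from cartesia import AsyncCartesiaTTS, CartesiaTTS
```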
cartesia/_types.py ADDED
@@ -0,0 +1,42 @@
+ from enum import Enum
+ from typing import List, Optional, TypedDict, Union
+
+ try:
+     import numpy as np
+
+     _NUMPY_AVAILABLE = True
+ except ImportError:
+     _NUMPY_AVAILABLE = False
+
+
+ class AudioDataReturnType(Enum):
+     BYTES = "bytes"
+     ARRAY = "array"
+
+
+ class AudioOutputFormat(Enum):
+     """Supported output formats for the audio."""
+
+     FP32 = "fp32"  # float32
+     PCM = "pcm"  # 16-bit signed integer PCM
+     FP32_16000 = "fp32_16000"  # float32, 16 kHz
+     FP32_22050 = "fp32_22050"  # float32, 22.05 kHz
+     FP32_44100 = "fp32_44100"  # float32, 44.1 kHz
+     PCM_16000 = "pcm_16000"  # 16-bit signed integer PCM, 16 kHz
+     PCM_22050 = "pcm_22050"  # 16-bit signed integer PCM, 22.05 kHz
+     PCM_44100 = "pcm_44100"  # 16-bit signed integer PCM, 44.1 kHz
+
+
+ class AudioOutput(TypedDict):
+     audio: Union[bytes, "np.ndarray"]
+     sampling_rate: int
+
+
+ Embedding = List[float]
+
+
+ class VoiceMetadata(TypedDict):
+     id: str
+     name: str
+     description: str
+     embedding: Optional[Embedding]
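As a quick illustration of how the new `_types` module fits together (a sketch based only on the definitions above, not an official example):

```python
from cartesia._types import AudioDataReturnType, AudioOutput, AudioOutputFormat

# The enums wrap plain strings, so user-facing arguments can be given either way.
assert AudioOutputFormat("fp32_44100") is AudioOutputFormat.FP32_44100
assert AudioDataReturnType("bytes") is AudioDataReturnType.BYTES

# AudioOutput is a TypedDict: a plain dict at runtime, typed for static checkers.
chunk: AudioOutput = {"audio": b"\x00\x00\x00\x00", "sampling_rate": 44100}
print(chunk["sampling_rate"])
```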
cartesia/tts.py CHANGED
@@ -3,33 +3,52 @@ import base64
  import json
  import os
  import uuid
- from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
+ from types import TracebackType
+ from typing import (
+     Any,
+     AsyncGenerator,
+     Dict,
+     Generator,
+     List,
+     Optional,
+     Tuple,
+     TypedDict,
+     Union,
+ )
 
  import aiohttp
  import httpx
+ import logging
  import requests
  from websockets.sync.client import connect
 
- DEFAULT_MODEL_ID = "genial-planet-1346"
- DEFAULT_BASE_URL = "api.cartesia.ai"
- DEFAULT_API_VERSION = "v0"
- DEFAULT_TIMEOUT = 60  # seconds
- DEFAULT_NUM_CONNECTIONS = 10  # connections per client
+ from cartesia.utils import retry_on_connection_error, retry_on_connection_error_async
+ from cartesia._types import (
+     AudioDataReturnType,
+     AudioOutputFormat,
+     AudioOutput,
+     Embedding,
+     VoiceMetadata,
+ )
 
+ try:
+     import numpy as np
 
- class AudioOutput(TypedDict):
-     audio: bytes
-     sampling_rate: int
+     _NUMPY_AVAILABLE = True
+ except ImportError:
+     _NUMPY_AVAILABLE = False
 
 
- Embedding = List[float]
+ DEFAULT_MODEL_ID = ""
+ DEFAULT_BASE_URL = "api.cartesia.ai"
+ DEFAULT_API_VERSION = "v0"
+ DEFAULT_TIMEOUT = 30  # seconds
+ DEFAULT_NUM_CONNECTIONS = 10  # connections per client
 
+ BACKOFF_FACTOR = 1
+ MAX_RETRIES = 3
 
- class VoiceMetadata(TypedDict):
-     id: str
-     name: str
-     description: str
-     embedding: Optional[Embedding]
+ logger = logging.getLogger(__name__)
 
 
  def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
@@ -71,10 +90,8 @@ class CartesiaTTS:
      and generate speech from text.
 
      The client also supports generating audio using a websocket for lower latency.
-     To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
 
      Examples:
-
          >>> client = CartesiaTTS()
 
          # Load available voices and their metadata (excluding the embeddings).
@@ -95,23 +112,17 @@ class CartesiaTTS:
          ... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
      """
 
-     def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
-         """
-         Args:
-             api_key: The API key to use for authorization.
-                 If not specified, the API key will be read from the environment variable
-                 `CARTESIA_API_KEY`.
-             experimental_ws_handle_interrupts: Whether to handle interrupts when generating
-                 audio using the websocket. This is an experimental feature and may have bugs
-                 or be deprecated in the future.
+     def __init__(self, *, api_key: str = None):
+         """Args:
+             api_key: The API key to use for authorization.
+                 If not specified, the API key will be read from the environment variable
+                 `CARTESIA_API_KEY`.
          """
          self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
          self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
          self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
          self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
          self.websocket = None
-         self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
-         self.refresh_websocket()
 
      def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
          """Returns a mapping from voice name -> voice metadata.
@@ -144,18 +155,25 @@ class CartesiaTTS:
          >>> audio = client.generate(transcript="Hello world!", voice=embedding)
          """
          params = {"select": "id, name, description"} if skip_embeddings else None
-         response = httpx.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
+         response = httpx.get(
+             f"{self._http_url()}/voices",
+             headers=self.headers,
+             params=params,
+             timeout=DEFAULT_TIMEOUT,
+         )
 
          if not response.is_success:
              raise ValueError(f"Failed to get voices. Error: {response.text}")
 
          voices = response.json()
-         # TODO: Update the API to return the embedding as a list of floats rather than string.
-         if not skip_embeddings:
-             for voice in voices:
+         for voice in voices:
+             if "embedding" in voice and isinstance(voice["embedding"], str):
                  voice["embedding"] = json.loads(voice["embedding"])
          return {voice["name"]: voice for voice in voices}
 
+     @retry_on_connection_error(
+         max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+     )
      def get_voice_embedding(
          self, *, voice_id: str = None, filepath: str = None, link: str = None
      ) -> Embedding:
@@ -178,18 +196,18 @@ class CartesiaTTS:
 
          if voice_id:
              url = f"{self._http_url()}/voices/embedding/{voice_id}"
-             response = httpx.get(url, headers=self.headers)
+             response = httpx.get(url, headers=self.headers, timeout=DEFAULT_TIMEOUT)
          elif filepath:
              url = f"{self._http_url()}/voices/clone/clip"
              files = {"clip": open(filepath, "rb")}
              headers = self.headers.copy()
              # The default content type of JSON is incorrect for file uploads
              headers.pop("Content-Type")
-             response = httpx.post(url, headers=headers, files=files)
+             response = httpx.post(url, headers=headers, files=files, timeout=DEFAULT_TIMEOUT)
          elif link:
              url = f"{self._http_url()}/voices/clone/url"
              params = {"link": link}
-             response = httpx.post(url, headers=self.headers, params=params)
+             response = httpx.post(url, headers=self.headers, params=params, timeout=DEFAULT_TIMEOUT)
 
          if not response.is_success:
              raise ValueError(
@@ -199,9 +217,10 @@ class CartesiaTTS:
 
          # Handle successful response
          out = response.json()
-         if isinstance(out["embedding"], str):
-             out["embedding"] = json.loads(out["embedding"])
-         return out["embedding"]
+         embedding = out["embedding"]
+         if isinstance(embedding, str):
+             embedding = json.loads(embedding)
+         return embedding
 
      def refresh_websocket(self):
          """Refresh the websocket connection.
@@ -209,22 +228,30 @@ class CartesiaTTS:
          Note:
              The connection is synchronous.
          """
-         if self.websocket and not self._is_websocket_closed():
-             self.websocket.close()
-         route = "audio/websocket"
-         if self.experimental_ws_handle_interrupts:
-             route = f"experimental/{route}"
-         self.websocket = connect(
-             f"{self._ws_url()}/{route}?api_key={self.api_key}",
-             close_timeout=None,
-         )
+         if self.websocket is None or self._is_websocket_closed():
+             route = "audio/websocket"
+             self.websocket = connect(f"{self._ws_url()}/{route}?api_key={self.api_key}")
 
      def _is_websocket_closed(self):
          return self.websocket.socket.fileno() == -1
 
      def _check_inputs(
-         self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
+         self,
+         transcript: str,
+         duration: Optional[float],
+         chunk_time: Optional[float],
+         output_format: Union[str, AudioOutputFormat],
+         data_rtype: Union[str, AudioDataReturnType],
      ):
+         # This will try the casting and raise an error.
+         _ = AudioOutputFormat(output_format)
+
+         if AudioDataReturnType(data_rtype) == AudioDataReturnType.ARRAY and not _NUMPY_AVAILABLE:
+             raise ImportError(
+                 "The 'numpy' package is required to use the 'array' return type. "
+                 "Please install 'numpy' or use 'bytes' as the return type."
+             )
+
          if chunk_time is not None:
              if chunk_time < 0.1 or chunk_time > 0.5:
                  raise ValueError("`chunk_time` must be between 0.1 and 0.5")
@@ -240,20 +267,24 @@ class CartesiaTTS:
          self,
          *,
          transcript: str,
+         voice: Embedding,
+         model_id: str,
+         output_format: AudioOutputFormat,
          duration: int = None,
          chunk_time: float = None,
-         voice: Embedding = None,
      ) -> Dict[str, Any]:
+         """Create the request body for a stream request.
+
+         Note that anything that's not provided will use a default if available or be
+         filtered out otherwise.
          """
-         Create the request body for a stream request.
-         Note that anything that's not provided will use a default if available or be filtered out otherwise.
-         """
-         body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=voice)
+         body = dict(transcript=transcript, model_id=model_id, voice=voice)
+         output_format = output_format.value
 
          optional_body = dict(
              duration=duration,
              chunk_time=chunk_time,
-             voice=voice,
+             output_format=output_format,
          )
          body.update({k: v for k, v in optional_body.items() if v is not None})
 
@@ -263,25 +294,30 @@ class CartesiaTTS:
          self,
          *,
          transcript: str,
+         voice: Embedding,
+         model_id: str = DEFAULT_MODEL_ID,
          duration: int = None,
          chunk_time: float = None,
-         voice: Embedding = None,
          stream: bool = False,
          websocket: bool = True,
+         output_format: Union[str, AudioOutputFormat] = "fp32",
+         data_rtype: str = "bytes",
      ) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
          """Generate audio from a transcript.
 
          Args:
-             transcript: The text to generate audio for.
-             duration: The maximum duration of the audio in seconds.
-             chunk_time: How long each audio segment should be in seconds.
+             transcript (str): The text to generate audio for.
+             voice (Embedding (List[float])): The voice to use for generating audio.
+             duration (int, optional): The maximum duration of the audio in seconds.
+             chunk_time (float, optional): How long each audio segment should be in seconds.
                  This should not need to be adjusted.
-             voice: The voice to use for generating audio.
-                 This can either be a voice id (string) or an embedding vector (List[float]).
-             stream: Whether to stream the audio or not.
-                 If ``True`` this function returns a generator.
-             websocket: Whether to use a websocket for streaming audio.
-                 Using the websocket reduces latency by pre-poning the handshake.
+             stream (bool, optional): Whether to stream the audio or not.
+                 If True this function returns a generator. False by default.
+             websocket (bool, optional): Whether to use a websocket for streaming audio.
+                 Using the websocket reduces latency by pre-poning the handshake. True by default.
+             data_rtype: The return type for the 'data' key in the dictionary.
+                 One of `'byte' | 'array'`.
+                 Note this field is experimental and may be deprecated in the future.
 
          Returns:
              A generator if `stream` is True, otherwise a dictionary.
@@ -289,17 +325,28 @@ class CartesiaTTS:
              * "audio": The audio as a bytes buffer.
              * "sampling_rate": The sampling rate of the audio.
          """
-         self._check_inputs(transcript, duration, chunk_time)
+         self._check_inputs(transcript, duration, chunk_time, output_format, data_rtype)
+
+         data_rtype = AudioDataReturnType(data_rtype)
+         output_format = AudioOutputFormat(output_format)
 
          body = self._generate_request_body(
-             transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
+             transcript=transcript,
+             voice=voice,
+             model_id=model_id,
+             duration=duration,
+             chunk_time=chunk_time,
+             output_format=output_format,
          )
 
          if websocket:
              generator = self._generate_ws(body)
          else:
-             generator = self._generate_http(body)
+             generator = self._generate_http_wrapper(body)
 
+         generator = self._postprocess_audio(
+             generator, data_rtype=data_rtype, output_format=output_format
+         )
          if stream:
              return generator
 
@@ -310,14 +357,61 @@ class CartesiaTTS:
              sampling_rate = chunk["sampling_rate"]
              chunks.append(chunk["audio"])
 
-         return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
+         if data_rtype == AudioDataReturnType.ARRAY:
+             cat = np.concatenate
+         else:
+             cat = b"".join
+
+         return {"audio": cat(chunks), "sampling_rate": sampling_rate}
+
+     def _postprocess_audio(
+         self,
+         generator: Generator[AudioOutput, None, None],
+         *,
+         data_rtype: AudioDataReturnType,
+         output_format: AudioOutputFormat,
+     ) -> Generator[AudioOutput, None, None]:
+         """Perform postprocessing on the generator outputs.
+
+         The postprocessing should be minimal (e.g. converting to array, casting dtype).
+         This code should not perform heavy operations like changing the sampling rate.
+
+         Args:
+             generator: A generator that yields audio chunks.
+             data_rtype: The data return type.
+             output_format: The output format for the audio.
+
+         Returns:
+             A generator that yields audio chunks.
+         """
+         dtype = None
+         if data_rtype == AudioDataReturnType.ARRAY:
+             dtype = np.float32 if "fp32" in output_format.value else np.int16
+
+         for chunk in generator:
+             if dtype is not None:
+                 chunk["audio"] = np.frombuffer(chunk["audio"], dtype=dtype)
+             yield chunk
+
+     @retry_on_connection_error(
+         max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+     )
+     def _generate_http_wrapper(self, body: Dict[str, Any]):
+         """Need to wrap the http generator in a function for the retry decorator to work."""
+         try:
+             for chunk in self._generate_http(body):
+                 yield chunk
+         except Exception as e:
+             logger.error(f"Failed to generate audio. {e}")
+             raise e
 
      def _generate_http(self, body: Dict[str, Any]):
          response = requests.post(
-             f"{self._http_url()}/audio/stream",
+             f"{self._http_url()}/audio/sse",
              stream=True,
              data=json.dumps(body),
              headers=self.headers,
+             timeout=(DEFAULT_TIMEOUT, DEFAULT_TIMEOUT),
          )
          if not response.ok:
              raise ValueError(f"Failed to generate audio. {response.text}")
@@ -356,21 +450,33 @@ class CartesiaTTS:
          try:
              while True:
                  response = json.loads(self.websocket.recv())
+                 if "error" in response:
+                     raise RuntimeError(f"Error generating audio:\n{response['error']}")
                  if response["done"]:
                      break
 
                  yield convert_response(response, include_context_id)
-
-                 if self.experimental_ws_handle_interrupts:
-                     self.websocket.send(json.dumps({"context_id": context_id}))
-         except GeneratorExit:
-             # The exit is only called when the generator is garbage collected.
-             # It may not be called directly after a break statement.
-             # However, the generator will be automatically cancelled on the next request.
-             if self.experimental_ws_handle_interrupts:
-                 self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
          except Exception as e:
+             # Close the websocket connection if an error occurs.
+             if self.websocket and not self._is_websocket_closed():
+                 self.websocket.close()
              raise RuntimeError(f"Failed to generate audio. {response}") from e
+         finally:
+             # Ensure the websocket is ultimately closed.
+             if self.websocket and not self._is_websocket_closed():
+                 self.websocket.close()
+
+     def prepare_audio_and_headers(
+         self, raw_audio: Union[bytes, str]
+     ) -> Tuple[bytes, Dict[str, Any]]:
+         if isinstance(raw_audio, str):
+             with open(raw_audio, "rb") as f:
+                 raw_audio_bytes = f.read()
+         else:
+             raw_audio_bytes = raw_audio
+         # application/json is not the right content type for this request
+         headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
+         return raw_audio_bytes, headers
 
      def _http_url(self):
          prefix = "http" if "localhost" in self.base_url else "https"
@@ -380,64 +486,103 @@ class CartesiaTTS:
          prefix = "ws" if "localhost" in self.base_url else "wss"
          return f"{prefix}://{self.base_url}/{self.api_version}"
 
-     def __del__(self):
-         if self.websocket.socket.fileno() > -1:
+     def close(self):
+         if self.websocket and not self._is_websocket_closed():
              self.websocket.close()
 
+     def __del__(self):
+         self.close()
 
- class AsyncCartesiaTTS(CartesiaTTS):
-     def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
-         self.timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
-         self.connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
-         self._session = aiohttp.ClientSession(timeout=self.timeout, connector=self.connector)
-         super().__init__(
-             api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
-         )
+     def __enter__(self):
+         self.refresh_websocket()
+         return self
+
+     def __exit__(
+         self,
+         exc_type: Union[type, None],
+         exc: Union[BaseException, None],
+         exc_tb: Union[TracebackType, None],
+     ):
+         self.close()
 
-     def refresh_websocket(self):
-         pass  # do not load the websocket for the client until asynchronously when it is needed
 
-     async def _async_refresh_websocket(self):
+ class AsyncCartesiaTTS(CartesiaTTS):
+     def __init__(self, *, api_key: str = None):
+         self._session = None
+         self._loop = None
+         super().__init__(api_key=api_key)
+
+     async def _get_session(self):
+         current_loop = asyncio.get_event_loop()
+         if self._loop is not current_loop:
+             # If the loop has changed, close the session and create a new one.
+             await self.close()
+         if self._session is None or self._session.closed:
+             timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
+             connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
+             self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
+             self._loop = current_loop
+         return self._session
+
+     async def refresh_websocket(self):
          """Refresh the websocket connection."""
-         if self.websocket and not self._is_websocket_closed():
-             self.websocket.close()
-         route = "audio/websocket"
-         if self.experimental_ws_handle_interrupts:
-             route = f"experimental/{route}"
-         self.websocket = await self._session.ws_connect(
-             f"{self._ws_url()}/{route}?api_key={self.api_key}"
-         )
+         if self.websocket is None or self._is_websocket_closed():
+             route = "audio/websocket"
+             session = await self._get_session()
+             self.websocket = await session.ws_connect(
+                 f"{self._ws_url()}/{route}?api_key={self.api_key}"
+             )
+
+     def _is_websocket_closed(self):
+         return self.websocket.closed
+
+     async def close(self):
+         """This method closes the websocket and the session.
+
+         It is *strongly* recommended to call this method when you are done using the client.
+         """
+         if self.websocket is not None and not self._is_websocket_closed():
+             await self.websocket.close()
+         if self._session is not None and not self._session.closed:
+             await self._session.close()
 
      async def generate(
          self,
          *,
          transcript: str,
+         voice: Embedding,
+         model_id: str = DEFAULT_MODEL_ID,
          duration: int = None,
          chunk_time: float = None,
-         voice: Embedding = None,
          stream: bool = False,
          websocket: bool = True,
+         output_format: Union[str, AudioOutputFormat] = "fp32",
+         data_rtype: Union[str, AudioDataReturnType] = "bytes",
      ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
          """Asynchronously generate audio from a transcript.
-         NOTE: This overrides the non-asynchronous generate method from the base class.
-         Args:
-             transcript: The text to generate audio for.
-             voice: The embedding to use for generating audio.
-             options: The options to use for generating audio. See :class:`GenerateOptions`.
-         Returns:
-             A dictionary containing the following:
-                 * "audio": The audio as a 1D numpy array.
-                 * "sampling_rate": The sampling rate of the audio.
+
+         For more information on the arguments, see the synchronous :meth:`CartesiaTTS.generate`.
          """
+         self._check_inputs(transcript, duration, chunk_time, output_format, data_rtype)
+         data_rtype = AudioDataReturnType(data_rtype)
+         output_format = AudioOutputFormat(output_format)
+
          body = self._generate_request_body(
-             transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
+             transcript=transcript,
+             voice=voice,
+             model_id=model_id,
+             duration=duration,
+             chunk_time=chunk_time,
+             output_format=output_format,
          )
 
          if websocket:
             generator = self._generate_ws(body)
          else:
-             generator = self._generate_http(body)
-
+             generator = self._generate_http_wrapper(body)
+         generator = self._postprocess_audio(
+             generator, data_rtype=data_rtype, output_format=output_format
+         )
          if stream:
              return generator
 
@@ -448,14 +593,49 @@ class AsyncCartesiaTTS(CartesiaTTS):
              sampling_rate = chunk["sampling_rate"]
              chunks.append(chunk["audio"])
 
-         return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
+         if data_rtype == AudioDataReturnType.ARRAY:
+             cat = np.concatenate
+         else:
+             cat = b"".join
+
+         return {"audio": cat(chunks), "sampling_rate": sampling_rate}
+
+     async def _postprocess_audio(
+         self,
+         generator: AsyncGenerator[AudioOutput, None],
+         *,
+         data_rtype: AudioDataReturnType,
+         output_format: AudioOutputFormat,
+     ) -> AsyncGenerator[AudioOutput, None]:
+         """See :meth:`CartesiaTTS._postprocess_audio`."""
+         dtype = None
+         if data_rtype == AudioDataReturnType.ARRAY:
+             dtype = np.float32 if "fp32" in output_format.value else np.int16
+
+         async for chunk in generator:
+             if dtype is not None:
+                 chunk["audio"] = np.frombuffer(chunk["audio"], dtype=dtype)
+             yield chunk
+
+     @retry_on_connection_error_async(
+         max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+     )
+     async def _generate_http_wrapper(self, body: Dict[str, Any]):
+         """Need to wrap the http generator in a function for the retry decorator to work."""
+         try:
+             async for chunk in self._generate_http(body):
+                 yield chunk
+         except Exception as e:
+             logger.error(f"Failed to generate audio. {e}")
+             raise e
 
      async def _generate_http(self, body: Dict[str, Any]):
-         async with self._session.post(
-             f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
+         session = await self._get_session()
+         async with session.post(
+             f"{self._http_url()}/audio/sse", data=json.dumps(body), headers=self.headers
          ) as response:
-             if response.status < 200 or response.status >= 300:
-                 raise ValueError(f"Failed to generate audio. {response.text}")
+             if not response.ok:
+                 raise ValueError(f"Failed to generate audio. {await response.text()}")
 
              buffer = ""
              async for chunk_bytes in response.content.iter_any():
@@ -473,12 +653,8 @@ class AsyncCartesiaTTS(CartesiaTTS):
 
      async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
          include_context_id = bool(context_id)
-         route = "audio/websocket"
-         if self.experimental_ws_handle_interrupts:
-             route = f"experimental/{route}"
-
          if not self.websocket or self._is_websocket_closed():
-             await self._async_refresh_websocket()
+             await self.refresh_websocket()
 
          ws = self.websocket
          if context_id is None:
@@ -492,26 +668,14 @@ class AsyncCartesiaTTS(CartesiaTTS):
                      break
 
                  yield convert_response(response, include_context_id)
-
-                 if self.experimental_ws_handle_interrupts:
-                     await ws.send_json({"context_id": context_id})
-         except GeneratorExit:
-             # The exit is only called when the generator is garbage collected.
-             # It may not be called directly after a break statement.
-             # However, the generator will be automatically cancelled on the next request.
-             if self.experimental_ws_handle_interrupts:
-                 await ws.send_json({"context_id": context_id, "action": "cancel"})
          except Exception as e:
-             raise RuntimeError(f"Failed to generate audio. {response}") from e
-
-     def _is_websocket_closed(self):
-         return self.websocket.closed
-
-     async def cleanup(self):
-         if self.websocket is not None and not self._is_websocket_closed():
-             await self.websocket.close()
-         if not self._session.closed:
-             await self._session.close()
+             if self.websocket and not self._is_websocket_closed():
+                 await self.websocket.close()
+             raise RuntimeError(f"Failed to generate audio. {await response.text()}") from e
+         finally:
+             # Ensure the websocket is ultimately closed.
+             if self.websocket and not self._is_websocket_closed():
+                 await self.websocket.close()
 
      def __del__(self):
          try:
@@ -520,6 +684,18 @@ class AsyncCartesiaTTS(CartesiaTTS):
              loop = None
 
          if loop is None:
-             asyncio.run(self.cleanup())
+             asyncio.run(self.close())
          else:
-             loop.create_task(self.cleanup())
+             loop.create_task(self.close())
+
+     async def __aenter__(self):
+         await self.refresh_websocket()
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: Union[type, None],
+         exc: Union[BaseException, None],
+         exc_tb: Union[TracebackType, None],
+     ):
+         await self.close()
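To make the new `generate` surface concrete, here is a hedged sketch that mirrors the README example above: the voice name, model id, and the `output_format`/`data_rtype` values are taken from the diff and README, and `data_rtype="array"` assumes numpy is installed.

```python
import os

import numpy as np

from cartesia.tts import CartesiaTTS

# Context-manager use is new in 0.1.0: __enter__ opens the websocket, __exit__ closes it.
with CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
    voices = client.get_voices()
    voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])

    output = client.generate(
        transcript="Hello! Welcome to Cartesia",
        voice=voice,
        model_id="genial-planet-1346",  # model id from the README example
        output_format="fp32_44100",     # one of the AudioOutputFormat values
        data_rtype="array",             # requires numpy; "bytes" is the default
    )
    audio: np.ndarray = output["audio"]
    print(audio.dtype, output["sampling_rate"])
```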
cartesia/utils.py ADDED
@@ -0,0 +1,87 @@
+ import time
+
+ from aiohttp.client_exceptions import ServerDisconnectedError
+ import asyncio
+ from functools import wraps
+ from http.client import RemoteDisconnected
+ from httpx import TimeoutException
+ from requests.exceptions import ConnectionError
+
+
+ def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
+     """Retry a function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
+
+     Args:
+         max_retries (int): The maximum number of retries.
+         backoff_factor (int): The factor to increase the delay between retries.
+         logger (logging.Logger): The logger to use for logging.
+     """
+
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             retry_count = 0
+             while retry_count < max_retries:
+                 try:
+                     return func(*args, **kwargs)
+                 except (
+                     ConnectionError,
+                     RemoteDisconnected,
+                     ServerDisconnectedError,
+                     TimeoutException,
+                 ) as e:
+                     logger.info(f"Retrying after exception: {e}")
+                     retry_count += 1
+                     if retry_count < max_retries:
+                         delay = backoff_factor * (2 ** (retry_count - 1))
+                         logger.warn(
+                             f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds..."
+                         )
+                         time.sleep(delay)
+                     else:
+                         raise Exception(f"Exception occurred after {max_retries} tries.") from e
+
+         return wrapper
+
+     return decorator
+
+
+ def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None):
+     """Retry an asynchronous function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
+
+     Args:
+         max_retries (int): The maximum number of retries.
+         backoff_factor (int): The factor to increase the delay between retries.
+         logger (logging.Logger): The logger to use for logging.
+     """
+
+     def decorator(func):
+         @wraps(func)
+         async def wrapper(*args, **kwargs):
+             retry_count = 0
+             while retry_count < max_retries:
+                 try:
+                     async for chunk in func(*args, **kwargs):
+                         yield chunk
+                     # If the function completes without raising an exception return
+                     return
+                 except (
+                     ConnectionError,
+                     RemoteDisconnected,
+                     ServerDisconnectedError,
+                     TimeoutException,
+                 ) as e:
+                     logger.info(f"Retrying after exception: {e}")
+                     retry_count += 1
+                     if retry_count < max_retries:
+                         delay = backoff_factor * (2 ** (retry_count - 1))
+                         logger.warn(
+                             f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds..."
+                         )
+                         await asyncio.sleep(delay)
+                     else:
+                         raise Exception(f"Exception occurred after {max_retries} tries.") from e
+
+         return wrapper
+
+     return decorator
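A minimal usage sketch for the new retry helpers (the `fetch_health` function is hypothetical; only the decorator and its arguments come from the module above). The decorator logs through the logger you pass in, so supplying one is effectively required, and after `max_retries` failed attempts it raises a wrapping `Exception` chained to the last error.

```python
import logging

import httpx

from cartesia.utils import retry_on_connection_error

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Hypothetical helper: retried on the connection/timeout errors listed above,
# with exponential backoff between attempts (backoff_factor * 2 ** (attempt - 1)).
@retry_on_connection_error(max_retries=3, backoff_factor=1, logger=logger)
def fetch_health(url: str) -> int:
    return httpx.get(url, timeout=5).status_code


if __name__ == "__main__":
    print(fetch_health("https://api.cartesia.ai"))
```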
cartesia/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.0.5rc1"
+ __version__ = "0.1.0"
cartesia-0.0.5rc1.dist-info/METADATA → cartesia-0.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 0.0.5rc1
+ Version: 0.1.0
  Summary: The official Python library for the Cartesia API.
  Home-page:
  Author: Cartesia, Inc.
@@ -16,25 +16,19 @@ Requires-Dist: pytest-asyncio
  Requires-Dist: requests
  Requires-Dist: websockets
  Provides-Extra: all
- Requires-Dist: pre-commit ; extra == 'all'
- Requires-Dist: docformatter ; extra == 'all'
- Requires-Dist: black ==24.1.1 ; extra == 'all'
- Requires-Dist: isort ==5.13.2 ; extra == 'all'
- Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
- Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
  Requires-Dist: pytest >=8.0.2 ; extra == 'all'
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
  Requires-Dist: twine ; extra == 'all'
+ Requires-Dist: setuptools ; extra == 'all'
+ Requires-Dist: wheel ; extra == 'all'
+ Requires-Dist: numpy ; extra == 'all'
  Provides-Extra: dev
- Requires-Dist: pre-commit ; extra == 'dev'
- Requires-Dist: docformatter ; extra == 'dev'
- Requires-Dist: black ==24.1.1 ; extra == 'dev'
- Requires-Dist: isort ==5.13.2 ; extra == 'dev'
- Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
- Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
  Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
  Requires-Dist: twine ; extra == 'dev'
+ Requires-Dist: setuptools ; extra == 'dev'
+ Requires-Dist: wheel ; extra == 'dev'
+ Requires-Dist: numpy ; extra == 'dev'
 
 
  # Cartesia Python API Library
@@ -60,13 +54,14 @@ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
60
54
  voices = client.get_voices()
61
55
  voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
62
56
  transcript = "Hello! Welcome to Cartesia"
57
+ model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
63
58
 
64
59
  p = pyaudio.PyAudio()
65
60
 
66
61
  stream = None
67
62
 
68
63
  # Generate and stream audio
69
- for output in client.generate(transcript=transcript, voice=voice, stream=True):
64
+ for output in client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
70
65
  buffer = output["audio"]
71
66
  rate = output["sampling_rate"]
72
67
 
@@ -84,26 +79,68 @@ stream.close()
  p.terminate()
  ```
 
- If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
+ You can also use the async client if you want to make asynchronous API calls:
+ ```python
+ from cartesia.tts import AsyncCartesiaTTS
+ import asyncio
+ import pyaudio
+ import os
+
+ async def write_stream():
+     client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
+     voices = client.get_voices()
+     voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
+     transcript = "Hello! Welcome to Cartesia"
+     model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
+
+     p = pyaudio.PyAudio()
+
+     stream = None
+
+     # Generate and stream audio
+     async for output in await client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
+         buffer = output["audio"]
+         rate = output["sampling_rate"]
+
+         if not stream:
+             stream = p.open(format=pyaudio.paFloat32,
+                             channels=1,
+                             rate=rate,
+                             output=True)
+
+         # Write the audio data to the stream
+         stream.write(buffer)
+
+     stream.stop_stream()
+     stream.close()
+     p.terminate()
+
+ asyncio.run(write_stream())
+ ```
+
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
+ Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
 
  ```python
- from cartesia.tts import CartesiaTTS
  from IPython.display import Audio
  import io
  import os
+ import numpy as np
 
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
- voices = client.get_voices()
- voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
- transcript = "Hello! Welcome to Cartesia"
+ from cartesia.tts import CartesiaTTS
 
- # Create a BytesIO object to store the audio data
- audio_data = io.BytesIO()
+ with CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
+     voices = client.get_voices()
+     voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
+     transcript = "Hello! Welcome to Cartesia"
 
- # Generate and stream audio
- for output in client.generate(transcript=transcript, voice=voice, stream=True):
-     buffer = output["audio"]
-     audio_data.write(buffer)
+     # Create a BytesIO object to store the audio data
+     audio_data = io.BytesIO()
+
+     # Generate and stream audio
+     for output in client.generate(transcript=transcript, voice=voice, stream=True):
+         buffer = output["audio"]
+         audio_data.write(buffer)
 
  # Set the cursor position to the beginning of the BytesIO object
  audio_data.seek(0)
@@ -115,25 +152,27 @@ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["s
115
152
  display(audio)
116
153
  ```
117
154
 
118
- You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
155
+ Below is the same example using the async client:
119
156
  ```python
120
- from cartesia.tts import AsyncCartesiaTTS
121
157
  from IPython.display import Audio
122
158
  import io
123
159
  import os
160
+ import numpy as np
124
161
 
125
- client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
126
- voices = client.get_voices()
127
- voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
128
- transcript = "Hello! Welcome to Cartesia"
162
+ from cartesia.tts import AsyncCartesiaTTS
129
163
 
130
- # Create a BytesIO object to store the audio data
131
- audio_data = io.BytesIO()
164
+ async with AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
165
+ voices = client.get_voices()
166
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
167
+ transcript = "Hello! Welcome to Cartesia"
132
168
 
133
- # Generate and stream audio
134
- async for output in client.generate(transcript=transcript, voice=voice, stream=True):
135
- buffer = output["audio"]
136
- audio_data.write(buffer)
169
+ # Create a BytesIO object to store the audio data
170
+ audio_data = io.BytesIO()
171
+
172
+ # Generate and stream audio
173
+ async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
174
+ buffer = output["audio"]
175
+ audio_data.write(buffer)
137
176
 
138
177
  # Set the cursor position to the beginning of the BytesIO object
139
178
  audio_data.seek(0)
cartesia-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ cartesia/__init__.py,sha256=uIc9xGNPs8_A6eAvbTUY1geazunYoEZVWFKhCwC9TRA,102
+ cartesia/_types.py,sha256=uf2Pe-9g7nU-RNUxNAFN3j5Cwy0WyLP1oZf6VV5rGgw,1001
+ cartesia/tts.py,sha256=hAADPdTYu7yGsY7yIQIf1hjKKJLUk9pm5LU0cEIB8gA,25806
+ cartesia/utils.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
+ cartesia/version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
+ cartesia-0.1.0.dist-info/METADATA,sha256=H7spLdviK35R839_OAB47JL2FAaGw6AZ7CnNs_xy87Q,6050
+ cartesia-0.1.0.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+ cartesia-0.1.0.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+ cartesia-0.1.0.dist-info/RECORD,,
cartesia-0.0.5rc1.dist-info/WHEEL → cartesia-0.1.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.41.2)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py2-none-any
  Tag: py3-none-any
cartesia-0.0.5rc1.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
- cartesia/tts.py,sha256=yPLz41AR0oAYPUNW48mqmwEEbLBHCnbaK_wPT0iFBVk,20543
- cartesia/version.py,sha256=VkI5lk2CFatZR200RqGd8cBjTnMDmhtZW7DI6mPe6n4,25
- cartesia-0.0.5rc1.dist-info/METADATA,sha256=632D6iZ2IU3MLySAnMtwV2zQA38XkQv1rfFF4iRdAco,4893
- cartesia-0.0.5rc1.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
- cartesia-0.0.5rc1.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
- cartesia-0.0.5rc1.dist-info/RECORD,,