PyPI - cartesia - Versions diffs - 0.0.6__tar.gz → 0.1.0__tar.gz - Mend

cartesia 0.0.6 (tar.gz) → 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{cartesia-0.0.6 → cartesia-0.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 0.0.6
+Version: 0.1.0
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.

cartesia-0.1.0/cartesia/_types.py ADDED Viewed

@@ -0,0 +1,42 @@
+from enum import Enum
+from typing import List, Optional, TypedDict, Union
+try:
+    import numpy as np
+    _NUMPY_AVAILABLE = True
+except ImportError:
+    _NUMPY_AVAILABLE = False
+class AudioDataReturnType(Enum):
+    BYTES = "bytes"
+    ARRAY = "array"
+class AudioOutputFormat(Enum):
+    """Supported output formats for the audio."""
+    FP32 = "fp32"  # float32
+    PCM = "pcm"  # 16-bit signed integer PCM
+    FP32_16000 = "fp32_16000"  # float32, 16 kHz
+    FP32_22050 = "fp32_22050"  # float32, 22.05 kHz
+    FP32_44100 = "fp32_44100"  # float32, 44.1 kHz
+    PCM_16000 = "pcm_16000"  # 16-bit signed integer PCM, 16 kHz
+    PCM_22050 = "pcm_22050"  # 16-bit signed integer PCM, 22.05 kHz
+    PCM_44100 = "pcm_44100"  # 16-bit signed integer PCM, 44.1 kHz
+class AudioOutput(TypedDict):
+    audio: Union[bytes, "np.ndarray"]
+    sampling_rate: int
+Embedding = List[float]
+class VoiceMetadata(TypedDict):
+    id: str
+    name: str
+    description: str
+    embedding: Optional[Embedding]

{cartesia-0.0.6 → cartesia-0.1.0}/cartesia/tts.py RENAMED Viewed

@@ -4,7 +4,17 @@ import json
 import os
 import uuid
 from types import TracebackType
-from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    Generator,
+    List,
+    Optional,
+    Tuple,
+    TypedDict,
+    Union,
+)
 import aiohttp
 import httpx
@@ -13,6 +23,21 @@ import requests
 from websockets.sync.client import connect
 from cartesia.utils import retry_on_connection_error, retry_on_connection_error_async
+from cartesia._types import (
+    AudioDataReturnType,
+    AudioOutputFormat,
+    AudioOutput,
+    Embedding,
+    VoiceMetadata,
+)
+try:
+    import numpy as np
+    _NUMPY_AVAILABLE = True
+except ImportError:
+    _NUMPY_AVAILABLE = False
 DEFAULT_MODEL_ID = ""
 DEFAULT_BASE_URL = "api.cartesia.ai"
@@ -25,20 +50,6 @@ MAX_RETRIES = 3
 logger = logging.getLogger(__name__)
-class AudioOutput(TypedDict):
-    audio: bytes
-    sampling_rate: int
-Embedding = List[float]
-class VoiceMetadata(TypedDict):
-    id: str
-    name: str
-    description: str
-    embedding: Optional[Embedding]
 def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
     buffer += chunk_bytes.decode("utf-8")
@@ -79,7 +90,6 @@ class CartesiaTTS:
     and generate speech from text.
     The client also supports generating audio using a websocket for lower latency.
-    To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
     Examples:
         >>> client = CartesiaTTS()
@@ -102,21 +112,17 @@ class CartesiaTTS:
         ...     audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
     """
-    def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
+    def __init__(self, *, api_key: str = None):
         """Args:
         api_key: The API key to use for authorization.
             If not specified, the API key will be read from the environment variable
             `CARTESIA_API_KEY`.
-        experimental_ws_handle_interrupts: Whether to handle interrupts when generating
-            audio using the websocket. This is an experimental feature and may have bugs
-            or be deprecated in the future.
         """
         self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
         self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
         self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
         self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
         self.websocket = None
-        self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
     def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
         """Returns a mapping from voice name -> voice metadata.
@@ -165,7 +171,9 @@ class CartesiaTTS:
                 voice["embedding"] = json.loads(voice["embedding"])
         return {voice["name"]: voice for voice in voices}
-    @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
+    @retry_on_connection_error(
+        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+    )
     def get_voice_embedding(
         self, *, voice_id: str = None, filepath: str = None, link: str = None
     ) -> Embedding:
@@ -222,16 +230,28 @@ class CartesiaTTS:
         """
         if self.websocket is None or self._is_websocket_closed():
             route = "audio/websocket"
-            if self.experimental_ws_handle_interrupts:
-                route = f"experimental/{route}"
             self.websocket = connect(f"{self._ws_url()}/{route}?api_key={self.api_key}")
     def _is_websocket_closed(self):
         return self.websocket.socket.fileno() == -1
     def _check_inputs(
-        self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
+        self,
+        transcript: str,
+        duration: Optional[float],
+        chunk_time: Optional[float],
+        output_format: Union[str, AudioOutputFormat],
+        data_rtype: Union[str, AudioDataReturnType],
     ):
+        # Validate eagerly: the enum constructor raises ValueError for an unsupported value.
+        _ = AudioOutputFormat(output_format)
+        if AudioDataReturnType(data_rtype) == AudioDataReturnType.ARRAY and not _NUMPY_AVAILABLE:
+            raise ImportError(
+                "The 'numpy' package is required to use the 'array' return type. "
+                "Please install 'numpy' or use 'bytes' as the return type."
+            )
         if chunk_time is not None:
             if chunk_time < 0.1 or chunk_time > 0.5:
                 raise ValueError("`chunk_time` must be between 0.1 and 0.5")
@@ -249,7 +269,7 @@ class CartesiaTTS:
         transcript: str,
         voice: Embedding,
         model_id: str,
-        output_format: str,
+        output_format: AudioOutputFormat,
         duration: int = None,
         chunk_time: float = None,
     ) -> Dict[str, Any]:
@@ -259,6 +279,7 @@ class CartesiaTTS:
         filtered out otherwise.
         """
         body = dict(transcript=transcript, model_id=model_id, voice=voice)
+        output_format = output_format.value
         optional_body = dict(
             duration=duration,
@@ -279,7 +300,8 @@ class CartesiaTTS:
         chunk_time: float = None,
         stream: bool = False,
         websocket: bool = True,
-        output_format: str = "fp32",
+        output_format: Union[str, AudioOutputFormat] = "fp32",
+        data_rtype: str = "bytes",
     ) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
         """Generate audio from a transcript.
@@ -293,6 +315,9 @@ class CartesiaTTS:
                 If True this function returns a generator. False by default.
             websocket (bool, optional): Whether to use a websocket for streaming audio.
                 Using the websocket reduces latency by pre-poning the handshake. True by default.
+            data_rtype: The return type for the 'audio' key in the dictionary.
+                One of `'bytes' | 'array'`.
+                Note this field is experimental and may be deprecated in the future.
         Returns:
             A generator if `stream` is True, otherwise a dictionary.
@@ -300,13 +325,16 @@ class CartesiaTTS:
                 * "audio": The audio as a bytes buffer.
                 * "sampling_rate": The sampling rate of the audio.
         """
-        self._check_inputs(transcript, duration, chunk_time)
+        self._check_inputs(transcript, duration, chunk_time, output_format, data_rtype)
+        data_rtype = AudioDataReturnType(data_rtype)
+        output_format = AudioOutputFormat(output_format)
         body = self._generate_request_body(
-            transcript=transcript,
-            voice=voice,
+            transcript=transcript,
+            voice=voice,
             model_id=model_id,
-            duration=duration,
+            duration=duration,
             chunk_time=chunk_time,
             output_format=output_format,
         )
@@ -316,6 +344,9 @@ class CartesiaTTS:
         else:
             generator = self._generate_http_wrapper(body)
+        generator = self._postprocess_audio(
+            generator, data_rtype=data_rtype, output_format=output_format
+        )
         if stream:
             return generator
@@ -326,9 +357,45 @@ class CartesiaTTS:
                 sampling_rate = chunk["sampling_rate"]
             chunks.append(chunk["audio"])
-        return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
+        if data_rtype == AudioDataReturnType.ARRAY:
+            cat = np.concatenate
+        else:
+            cat = b"".join
+        return {"audio": cat(chunks), "sampling_rate": sampling_rate}
+    def _postprocess_audio(
+        self,
+        generator: Generator[AudioOutput, None, None],
+        *,
+        data_rtype: AudioDataReturnType,
+        output_format: AudioOutputFormat,
+    ) -> Generator[AudioOutput, None, None]:
+        """Perform postprocessing on the generator outputs.
+        The postprocessing should be minimal (e.g. converting to array, casting dtype).
+        This code should not perform heavy operations like changing the sampling rate.
+        Args:
+            generator: A generator that yields audio chunks.
+            data_rtype: The data return type.
+            output_format: The output format for the audio.
+        Returns:
+            A generator that yields audio chunks.
+        """
+        dtype = None
+        if data_rtype == AudioDataReturnType.ARRAY:
+            dtype = np.float32 if "fp32" in output_format.value else np.int16
+        for chunk in generator:
+            if dtype is not None:
+                chunk["audio"] = np.frombuffer(chunk["audio"], dtype=dtype)
+            yield chunk
-    @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
+    @retry_on_connection_error(
+        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+    )
     def _generate_http_wrapper(self, body: Dict[str, Any]):
         """Need to wrap the http generator in a function for the retry decorator to work."""
         try:
@@ -389,15 +456,6 @@ class CartesiaTTS:
                     break
                 yield convert_response(response, include_context_id)
-                if self.experimental_ws_handle_interrupts:
-                    self.websocket.send(json.dumps({"context_id": context_id}))
-        except GeneratorExit:
-            # The exit is only called when the generator is garbage collected.
-            # It may not be called directly after a break statement.
-            # However, the generator will be automatically cancelled on the next request.
-            if self.experimental_ws_handle_interrupts:
-                self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
         except Exception as e:
             # Close the websocket connection if an error occurs.
             if self.websocket and not self._is_websocket_closed():
@@ -408,23 +466,6 @@ class CartesiaTTS:
             if self.websocket and not self._is_websocket_closed():
                 self.websocket.close()
-    @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
-    def transcribe(self, raw_audio: Union[bytes, str]) -> str:
-        raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
-        response = httpx.post(
-            f"{self._http_url()}/audio/transcriptions",
-            headers=headers,
-            files={"clip": ("input.wav", raw_audio_bytes)},
-            timeout=DEFAULT_TIMEOUT,
-        )
-        if not response.is_success:
-            raise ValueError(f"Failed to transcribe audio. Error: {response.text()}")
-        transcript = response.json()
-        return transcript["text"]
     def prepare_audio_and_headers(
         self, raw_audio: Union[bytes, str]
     ) -> Tuple[bytes, Dict[str, Any]]:
@@ -466,13 +507,11 @@ class CartesiaTTS:
 class AsyncCartesiaTTS(CartesiaTTS):
-    def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
+    def __init__(self, *, api_key: str = None):
         self._session = None
         self._loop = None
-        super().__init__(
-            api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
-        )
+        super().__init__(api_key=api_key)
     async def _get_session(self):
         current_loop = asyncio.get_event_loop()
         if self._loop is not current_loop:
@@ -481,29 +520,25 @@ class AsyncCartesiaTTS(CartesiaTTS):
         if self._session is None or self._session.closed:
             timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
             connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
-            self._session = aiohttp.ClientSession(
-                timeout=timeout, connector=connector
-            )
+            self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
             self._loop = current_loop
         return self._session
     async def refresh_websocket(self):
         """Refresh the websocket connection."""
         if self.websocket is None or self._is_websocket_closed():
             route = "audio/websocket"
-            if self.experimental_ws_handle_interrupts:
-                route = f"experimental/{route}"
             session = await self._get_session()
             self.websocket = await session.ws_connect(
                 f"{self._ws_url()}/{route}?api_key={self.api_key}"
             )
     def _is_websocket_closed(self):
         return self.websocket.closed
     async def close(self):
         """This method closes the websocket and the session.
         It is *strongly* recommended to call this method when you are done using the client.
         """
         if self.websocket is not None and not self._is_websocket_closed():
@@ -521,35 +556,22 @@ class AsyncCartesiaTTS(CartesiaTTS):
         chunk_time: float = None,
         stream: bool = False,
         websocket: bool = True,
-        output_format: str = "fp32"
+        output_format: Union[str, AudioOutputFormat] = "fp32",
+        data_rtype: Union[str, AudioDataReturnType] = "bytes",
     ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
         """Asynchronously generate audio from a transcript.
-        NOTE: This overrides the non-asynchronous generate method from the base class.
-        Args:
-            transcript (str): The text to generate audio for.
-            voice (Embedding (List[float])): The voice to use for generating audio.
-            duration (int, optional): The maximum duration of the audio in seconds.
-            chunk_time (float, optional): How long each audio segment should be in seconds.
-                This should not need to be adjusted.
-            stream (bool, optional): Whether to stream the audio or not.
-                If True this function returns a generator. False by default.
-            websocket (bool, optional): Whether to use a websocket for streaming audio.
-                Using the websocket reduces latency by pre-poning the handshake. True by default.
-        Returns:
-            A generator if `stream` is True, otherwise a dictionary.
-            Dictionary from both generator and non-generator return types have the following keys:
-                * "audio": The audio as a bytes buffer.
-                * "sampling_rate": The sampling rate of the audio.
+        For more information on the arguments, see the synchronous :meth:`CartesiaTTS.generate`.
         """
-        self._check_inputs(transcript, duration, chunk_time)
+        self._check_inputs(transcript, duration, chunk_time, output_format, data_rtype)
+        data_rtype = AudioDataReturnType(data_rtype)
+        output_format = AudioOutputFormat(output_format)
         body = self._generate_request_body(
-            transcript=transcript,
+            transcript=transcript,
             voice=voice,
             model_id=model_id,
-            duration=duration,
+            duration=duration,
             chunk_time=chunk_time,
             output_format=output_format,
         )
@@ -558,7 +580,9 @@ class AsyncCartesiaTTS(CartesiaTTS):
             generator = self._generate_ws(body)
         else:
             generator = self._generate_http_wrapper(body)
+        generator = self._postprocess_audio(
+            generator, data_rtype=data_rtype, output_format=output_format
+        )
         if stream:
             return generator
@@ -569,14 +593,38 @@ class AsyncCartesiaTTS(CartesiaTTS):
                 sampling_rate = chunk["sampling_rate"]
             chunks.append(chunk["audio"])
-        return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
+        if data_rtype == AudioDataReturnType.ARRAY:
+            cat = np.concatenate
+        else:
+            cat = b"".join
+        return {"audio": cat(chunks), "sampling_rate": sampling_rate}
+    async def _postprocess_audio(
+        self,
+        generator: AsyncGenerator[AudioOutput, None],
+        *,
+        data_rtype: AudioDataReturnType,
+        output_format: AudioOutputFormat,
+    ) -> AsyncGenerator[AudioOutput, None]:
+        """See :meth:`CartesiaTTS._postprocess_audio`."""
+        dtype = None
+        if data_rtype == AudioDataReturnType.ARRAY:
+            dtype = np.float32 if "fp32" in output_format.value else np.int16
+        async for chunk in generator:
+            if dtype is not None:
+                chunk["audio"] = np.frombuffer(chunk["audio"], dtype=dtype)
+            yield chunk
-    @retry_on_connection_error_async(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
+    @retry_on_connection_error_async(
+        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+    )
     async def _generate_http_wrapper(self, body: Dict[str, Any]):
         """Need to wrap the http generator in a function for the retry decorator to work."""
         try:
-          async for chunk in self._generate_http(body):
-              yield chunk
+            async for chunk in self._generate_http(body):
+                yield chunk
         except Exception as e:
             logger.error(f"Failed to generate audio. {e}")
             raise e
@@ -605,10 +653,6 @@ class AsyncCartesiaTTS(CartesiaTTS):
     async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
         include_context_id = bool(context_id)
-        route = "audio/websocket"
-        if self.experimental_ws_handle_interrupts:
-            route = f"experimental/{route}"
         if not self.websocket or self._is_websocket_closed():
             await self.refresh_websocket()
@@ -624,15 +668,6 @@ class AsyncCartesiaTTS(CartesiaTTS):
                     break
                 yield convert_response(response, include_context_id)
-                if self.experimental_ws_handle_interrupts:
-                    await ws.send_json({"context_id": context_id})
-        except GeneratorExit:
-            # The exit is only called when the generator is garbage collected.
-            # It may not be called directly after a break statement.
-            # However, the generator will be automatically cancelled on the next request.
-            if self.experimental_ws_handle_interrupts:
-                await ws.send_json({"context_id": context_id, "action": "cancel"})
         except Exception as e:
             if self.websocket and not self._is_websocket_closed():
                 await self.websocket.close()
@@ -642,21 +677,6 @@ class AsyncCartesiaTTS(CartesiaTTS):
             if self.websocket and not self._is_websocket_closed():
                 await self.websocket.close()
-    async def transcribe(self, raw_audio: Union[bytes, str]) -> str:
-        raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
-        data = aiohttp.FormData()
-        data.add_field("clip", raw_audio_bytes, filename="input.wav", content_type="audio/wav")
-        session = await self._get_session()
-        async with session.post(
-            f"{self._http_url()}/audio/transcriptions", headers=headers, data=data
-        ) as response:
-            if not response.ok:
-                raise ValueError(f"Failed to transcribe audio. Error: {await response.text()}")
-            transcript = await response.json()
-            return transcript["text"]
     def __del__(self):
         try:
             loop = asyncio.get_running_loop()

{cartesia-0.0.6 → cartesia-0.1.0}/cartesia/utils.py RENAMED Viewed

@@ -7,6 +7,7 @@ from http.client import RemoteDisconnected
 from httpx import TimeoutException
 from requests.exceptions import ConnectionError
 def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
     """Retry a function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
@@ -15,6 +16,7 @@ def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
         backoff_factor (int): The factor to increase the delay between retries.
         logger (logging.Logger): The logger to use for logging.
     """
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
@@ -22,18 +24,28 @@ def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
             while retry_count < max_retries:
                 try:
                     return func(*args, **kwargs)
-                except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
+                except (
+                    ConnectionError,
+                    RemoteDisconnected,
+                    ServerDisconnectedError,
+                    TimeoutException,
+                ) as e:
                     logger.info(f"Retrying after exception: {e}")
                     retry_count += 1
                     if retry_count < max_retries:
                         delay = backoff_factor * (2 ** (retry_count - 1))
-                        logger.warn(f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds...")
+                        logger.warn(
+                            f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds..."
+                        )
                         time.sleep(delay)
                     else:
                         raise Exception(f"Exception occurred after {max_retries} tries.") from e
         return wrapper
     return decorator
 def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None):
     """Retry an asynchronous function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
@@ -42,6 +54,7 @@ def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None
         backoff_factor (int): The factor to increase the delay between retries.
         logger (logging.Logger): The logger to use for logging.
     """
     def decorator(func):
         @wraps(func)
         async def wrapper(*args, **kwargs):
@@ -52,14 +65,23 @@ def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None
                         yield chunk
                     # If the function completes without raising an exception return
                     return
-                except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
+                except (
+                    ConnectionError,
+                    RemoteDisconnected,
+                    ServerDisconnectedError,
+                    TimeoutException,
+                ) as e:
                     logger.info(f"Retrying after exception: {e}")
                     retry_count += 1
                     if retry_count < max_retries:
                         delay = backoff_factor * (2 ** (retry_count - 1))
-                        logger.warn(f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds...")
+                        logger.warn(
+                            f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds..."
+                        )
                         await asyncio.sleep(delay)
                     else:
                         raise Exception(f"Exception occurred after {max_retries} tries.") from e
         return wrapper
-    return decorator
+    return decorator

cartesia-0.1.0/cartesia/version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.1.0"

{cartesia-0.0.6 → cartesia-0.1.0}/cartesia.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 0.0.6
+Version: 0.1.0
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.

{cartesia-0.0.6 → cartesia-0.1.0}/cartesia.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,6 +2,7 @@ README.md
 pyproject.toml
 setup.py
 cartesia/__init__.py
+cartesia/_types.py
 cartesia/tts.py
 cartesia/utils.py
 cartesia/version.py

{cartesia-0.0.6 → cartesia-0.1.0}/cartesia.egg-info/requires.txt RENAMED Viewed

@@ -10,6 +10,7 @@ pytest-cov>=4.1.0
 twine
 setuptools
 wheel
+numpy
 [dev]
 pytest>=8.0.2
@@ -17,3 +18,4 @@ pytest-cov>=4.1.0
 twine
 setuptools
 wheel
+numpy

{cartesia-0.0.6 → cartesia-0.1.0}/tests/test_tts.py RENAMED Viewed

@@ -8,15 +8,16 @@ general correctness.
 import logging
 import os
 import sys
-from cartesia.tts import DEFAULT_MODEL_ID, AsyncCartesiaTTS, CartesiaTTS, VoiceMetadata
-from typing import AsyncGenerator, Dict, Generator, List
+from cartesia.tts import DEFAULT_MODEL_ID, AsyncCartesiaTTS, CartesiaTTS
+from cartesia._types import AudioDataReturnType, AudioOutputFormat, VoiceMetadata
+from typing import AsyncGenerator, Dict, Generator, List, Optional, Union
+import numpy as np
 import pytest
 THISDIR = os.path.dirname(__file__)
 sys.path.insert(0, os.path.dirname(THISDIR))
-SAMPLE_VOICE = "Samantha"
+SAMPLE_VOICE = "Newsman"
 logger = logging.getLogger(__name__)
@@ -166,6 +167,78 @@ def test_generate_context_manager_with_err():
     assert websocket.socket.fileno() == -1  # check socket is now closed
+@pytest.mark.parametrize("output_format", [_fmt for _fmt in AudioOutputFormat])
+@pytest.mark.parametrize("as_str", [True, False])
+@pytest.mark.parametrize("stream", [True, False])
+@pytest.mark.parametrize("websocket", [True, False])
+def test_generate_with_output_format(
+    resources: _Resources,
+    output_format: AudioOutputFormat,
+    as_str: bool,
+    stream: bool,
+    websocket: bool,
+):
+    value = output_format.value
+    client = resources.client
+    voices = resources.voices
+    embedding = voices[SAMPLE_VOICE]["embedding"]
+    transcript = "Hello, world!"
+    split = value.split("_")
+    expected_sampling_rate = int(split[1]) if len(split) == 2 else 44_100
+    # Easy way to get around iterating over stream=True / False.
+    output_generate = client.generate(
+        transcript=transcript,
+        voice=embedding,
+        websocket=websocket,
+        stream=stream,
+        output_format=output_format.value if as_str else output_format,
+    )
+    if not stream:
+        output_generate = [output_generate]
+    for out in output_generate:
+        assert isinstance(out["audio"], bytes)
+        assert out["sampling_rate"] == expected_sampling_rate
+@pytest.mark.parametrize("data_rtype", [_fmt for _fmt in AudioDataReturnType])
+@pytest.mark.parametrize("as_str", [True, False])
+@pytest.mark.parametrize("stream", [True, False])
+@pytest.mark.parametrize("websocket", [True, False])
+def test_generate_with_data_rtype(
+    resources: _Resources,
+    data_rtype: AudioDataReturnType,
+    as_str: bool,
+    stream: bool,
+    websocket: bool,
+):
+    client = resources.client
+    voices = resources.voices
+    embedding = voices[SAMPLE_VOICE]["embedding"]
+    transcript = "Hello, world!"
+    # Easy way to get around iterating over stream=True / False.
+    output_generate = client.generate(
+        transcript=transcript,
+        voice=embedding,
+        websocket=websocket,
+        stream=stream,
+        data_rtype=data_rtype.value if as_str else data_rtype,
+    )
+    if not stream:
+        output_generate = [output_generate]
+    for out in output_generate:
+        if data_rtype == AudioDataReturnType.BYTES:
+            assert isinstance(out["audio"], bytes)
+        elif data_rtype == AudioDataReturnType.ARRAY:
+            assert isinstance(out["audio"], np.ndarray)
+            assert out["audio"].dtype == np.float32
 @pytest.mark.parametrize("websocket", [True, False])
 @pytest.mark.asyncio
 async def test_async_generate(resources: _Resources, websocket: bool):
@@ -199,7 +272,9 @@ async def test_async_generate_stream(resources: _Resources, websocket: bool):
     async_client = create_async_client()
     try:
-        generator = await async_client.generate(transcript=transcript, voice=embedding, websocket=websocket, stream=True)
+        generator = await async_client.generate(
+            transcript=transcript, voice=embedding, websocket=websocket, stream=True
+        )
         assert isinstance(generator, AsyncGenerator)
         async for output in generator:
             assert output.keys() == {"audio", "sampling_rate"}
@@ -248,18 +323,111 @@ async def test_generate_async_context_manager_with_err():
     assert websocket.closed  # check websocket is now closed
+@pytest.mark.parametrize("output_format", list(AudioOutputFormat))
+@pytest.mark.parametrize("as_str", [True, False])
+@pytest.mark.parametrize("stream", [True, False])
+@pytest.mark.asyncio
+async def test_generate_async_with_output_format(
+    resources: _Resources, output_format: AudioOutputFormat, as_str: bool, stream: bool
+):
+    """Check async HTTP `generate` honours every `AudioOutputFormat`.
+
+    Each output must carry `bytes` audio and the sampling rate implied by the
+    format. `as_str` selects whether the format is passed as the enum member
+    or as its string value.
+    """
+    logger.info(
+        f"Testing async generate stream with output_format={output_format}, as_str={as_str}, stream={stream}"
+    )
+    voice_embedding = resources.voices[SAMPLE_VOICE]["embedding"]
+    # Format values are "<encoding>" or "<encoding>_<rate>"; without a rate
+    # suffix the test expects 44.1 kHz.
+    fields = output_format.value.split("_")
+    if len(fields) == 2:
+        expected_sampling_rate = int(fields[1])
+    else:
+        expected_sampling_rate = 44_100
+    def _validate(output):
+        assert isinstance(output["audio"], bytes)
+        assert output["sampling_rate"] == expected_sampling_rate
+    async_client = create_async_client()
+    try:
+        response = await async_client.generate(
+            transcript="Hello, world!",
+            voice=voice_embedding,
+            websocket=False,
+            stream=stream,
+            output_format=output_format.value if as_str else output_format,
+        )
+        if not stream:
+            _validate(response)
+            return
+        assert isinstance(response, AsyncGenerator)
+        async for chunk in response:
+            _validate(chunk)
+    finally:
+        # Close the client even when an assertion above failed.
+        await async_client.close()
+@pytest.mark.parametrize("data_rtype", [_fmt for _fmt in AudioDataReturnType])
+@pytest.mark.parametrize("as_str", [True, False])
+@pytest.mark.parametrize("stream", [True, False])
+@pytest.mark.parametrize("websocket", [True, False])
+@pytest.mark.asyncio
+async def test_generate_async_with_data_rtype(
+    resources: _Resources,
+    data_rtype: AudioDataReturnType,
+    as_str: bool,
+    stream: bool,
+    websocket: bool,
+):
+    """Check that async `generate` returns audio in the requested `data_rtype`.
+
+    `as_str` selects whether the return type is passed as the enum member or
+    as its string value. BYTES must yield `bytes`; ARRAY must yield a float32
+    `np.ndarray`. Runs once per transport selected by `websocket`.
+    """
+    voices = resources.voices
+    embedding = voices[SAMPLE_VOICE]["embedding"]
+    transcript = "Hello, world!"
+    async_client = create_async_client()
+    def _validate(output):
+        if data_rtype == AudioDataReturnType.BYTES:
+            assert isinstance(output["audio"], bytes)
+        elif data_rtype == AudioDataReturnType.ARRAY:
+            assert isinstance(output["audio"], np.ndarray)
+            assert output["audio"].dtype == np.float32
+    try:
+        output_generate = await async_client.generate(
+            transcript=transcript,
+            voice=embedding,
+            # Forward the parametrized flag; hard-coding False here made the
+            # websocket=True cases silently repeat the HTTP ones.
+            websocket=websocket,
+            stream=stream,
+            data_rtype=data_rtype.value if as_str else data_rtype,
+        )
+        if stream:
+            assert isinstance(output_generate, AsyncGenerator)
+            async for output in output_generate:
+                _validate(output)
+        else:
+            _validate(output_generate)
+    finally:
+        # Close the websocket
+        await async_client.close()
 @pytest.mark.parametrize("chunk_time", [0.05, 0.6])
 def test_check_inputs_invalid_chunk_time(client: CartesiaTTS, chunk_time):
+    """Expect `_check_inputs` to raise ValueError for chunk_time 0.05 and 0.6."""
     logger.info(f"Testing invalid chunk_time: {chunk_time}")
     with pytest.raises(ValueError, match="`chunk_time` must be between 0.1 and 0.5"):
-        client._check_inputs("Test", None, chunk_time)
+        # Format and return type are fixed to valid values so only
+        # `chunk_time` can trigger the error.
+        client._check_inputs(
+            "Test",
+            None,
+            chunk_time,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
 @pytest.mark.parametrize("chunk_time", [0.1, 0.3, 0.5])
 def test_check_inputs_valid_chunk_time(client, chunk_time):
+    """Accept chunk_time values 0.1, 0.3 and 0.5 without raising ValueError."""
-    logger.info("Testing valid chunk_time: {chunk_time}")
+    # f-string so the log shows the actual value instead of the literal
+    # "{chunk_time}" placeholder.
+    logger.info(f"Testing valid chunk_time: {chunk_time}")
     try:
-        client._check_inputs("Test", None, chunk_time)
+        client._check_inputs(
+            "Test",
+            None,
+            chunk_time,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
     except ValueError:
         pytest.fail("Unexpected ValueError raised")
@@ -267,14 +435,26 @@ def test_check_inputs_valid_chunk_time(client, chunk_time):
 def test_check_inputs_duration_less_than_chunk_time(client: CartesiaTTS):
+    """Expect ValueError when duration (0.2) is smaller than chunk_time (0.3)."""
     logger.info("Testing duration less than chunk_time")
     with pytest.raises(ValueError, match="`duration` must be greater than chunk_time"):
-        client._check_inputs("Test", 0.2, 0.3)
+        client._check_inputs(
+            "Test",
+            0.2,
+            0.3,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
 @pytest.mark.parametrize("duration,chunk_time", [(0.5, 0.2), (1.0, 0.5), (2.0, 0.1)])
 def test_check_inputs_valid_duration_and_chunk_time(client: CartesiaTTS, duration, chunk_time):
+    """Accept each (duration, chunk_time) pair without raising ValueError."""
     logger.info(f"Testing valid duration: {duration} and chunk_time: {chunk_time}")
     try:
-        client._check_inputs("Test", duration, chunk_time)
+        client._check_inputs(
+            "Test",
+            duration,
+            chunk_time,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
     except ValueError:
         pytest.fail("Unexpected ValueError raised")
@@ -282,13 +462,99 @@ def test_check_inputs_valid_duration_and_chunk_time(client: CartesiaTTS, duratio
 def test_check_inputs_empty_transcript(client: CartesiaTTS):
+    """Expect `_check_inputs` to raise ValueError for an empty transcript."""
     logger.info("Testing empty transcript")
     with pytest.raises(ValueError, match="`transcript` must be non empty"):
-        client._check_inputs("", None, None)
+        client._check_inputs(
+            "",
+            None,
+            None,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
 @pytest.mark.parametrize("transcript", ["Hello", "Test transcript", "Lorem ipsum dolor sit amet"])
 def test_check_inputs_valid_transcript(client: CartesiaTTS, transcript):
+    """Accept each non-empty transcript without raising ValueError."""
     logger.info(f"Testing valid transcript: {transcript}")
     try:
-        client._check_inputs(transcript, None, None)
+        client._check_inputs(
+            transcript,
+            None,
+            None,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
     except ValueError:
         pytest.fail("Unexpected ValueError raised")
+@pytest.mark.parametrize(
+    "output_format,error",
+    [
+        # Accepted: bare encodings and encodings with a supported rate.
+        ("fp32", None),
+        ("pcm", None),
+        ("fp32_16000", None),
+        ("fp32_22050", None),
+        ("fp32_44100", None),
+        ("pcm_16000", None),
+        ("pcm_22050", None),
+        ("pcm_44100", None),
+        # Rejected: unknown names, arbitrary sampling rates, fp16.
+        ("invalid", ValueError),
+        ("pcm_1234", ValueError),
+        ("fp32_1234", ValueError),
+        ("fp16_44100", ValueError),
+    ],
+)
+def test_check_inputs_output_format(
+    client: CartesiaTTS, output_format: Union[str, AudioOutputFormat], error: Optional[Exception]
+):
+    """Check `_check_inputs` accepts or rejects each `output_format` string.
+
+    `error` is the exception class expected from the call, or None when the
+    format must be accepted.
+    """
+    def _check():
+        client._check_inputs(
+            "Test",
+            None,
+            None,
+            output_format=output_format,
+            data_rtype=AudioDataReturnType.BYTES,
+        )
+    if not error:
+        _check()
+        return
+    with pytest.raises(error):
+        _check()
+@pytest.mark.parametrize(
+    "data_rtype,error",
+    [
+        # Accepted return types.
+        ("bytes", None),
+        ("array", None),
+        # Rejected return types.
+        ("invalid", ValueError),
+        ("tensor", ValueError),
+    ],
+)
+def test_check_inputs_data_rtype(
+    client: CartesiaTTS, data_rtype: Union[str, AudioDataReturnType], error: Optional[Exception]
+):
+    """Check `_check_inputs` accepts or rejects each `data_rtype` string.
+
+    `error` is the exception class expected from the call, or None when the
+    return type must be accepted.
+    """
+    def _check():
+        client._check_inputs(
+            "Test",
+            None,
+            None,
+            output_format=AudioOutputFormat.FP32,
+            data_rtype=data_rtype,
+        )
+    if not error:
+        _check()
+        return
+    with pytest.raises(error):
+        _check()