cartesia-1.1.0.dev0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from cartesia.async_client import AsyncCartesia
+ from cartesia.client import Cartesia
+
+ __all__ = ["Cartesia", "AsyncCartesia"]
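For reference, a minimal sketch of how these two exports are typically constructed. The `api_key` parameter is an assumption here; the client modules (`cartesia/client.py`, `cartesia/async_client.py`) are not part of this diff, so check them for the actual signatures.

    from cartesia import AsyncCartesia, Cartesia

    client = Cartesia(api_key="your-api-key")              # synchronous client
    async_client = AsyncCartesia(api_key="your-api-key")   # asyncio client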
cartesia/_async_sse.py ADDED
@@ -0,0 +1,95 @@
+ import base64
+ import json
+ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
+
+ import aiohttp
+
+ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
+ from cartesia._logger import logger
+ from cartesia._sse import _SSE
+ from cartesia._types import OutputFormat, VoiceControls
+ from cartesia.utils.retry import retry_on_connection_error_async
+ from cartesia.utils.tts import _construct_tts_request
+
+
+ class _AsyncSSE(_SSE):
+     """This class contains methods to generate audio using Server-Sent Events asynchronously."""
+
+     def __init__(
+         self,
+         http_url: str,
+         headers: Dict[str, str],
+         timeout: float,
+         get_session: Callable[[], Optional[aiohttp.ClientSession]],
+     ):
+         super().__init__(http_url, headers, timeout)
+         self._get_session = get_session
+
+     async def send(
+         self,
+         model_id: str,
+         transcript: str,
+         output_format: OutputFormat,
+         voice_id: Optional[str] = None,
+         voice_embedding: Optional[List[float]] = None,
+         duration: Optional[int] = None,
+         language: Optional[str] = None,
+         stream: bool = True,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
+     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+         request_body = _construct_tts_request(
+             model_id=model_id,
+             transcript=transcript,
+             output_format=output_format,
+             voice_id=voice_id,
+             voice_embedding=voice_embedding,
+             duration=duration,
+             language=language,
+             _experimental_voice_controls=_experimental_voice_controls,
+         )
+
+         generator = self._sse_generator_wrapper(request_body)
+
+         if stream:
+             return generator
+
+         chunks = []
+         async for chunk in generator:
+             chunks.append(chunk["audio"])
+
+         return {"audio": b"".join(chunks)}
+
+     @retry_on_connection_error_async(
+         max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+     )
+     async def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
+         """Need to wrap the sse generator in a function for the retry decorator to work."""
+         try:
+             async for chunk in self._sse_generator(request_body):
+                 yield chunk
+         except Exception as e:
+             raise RuntimeError(f"Error generating audio. {e}")
+
+     async def _sse_generator(self, request_body: Dict[str, Any]):
+         session = await self._get_session()
+         async with session.post(
+             f"{self.http_url}/tts/sse",
+             data=json.dumps(request_body),
+             headers=self.headers,
+         ) as response:
+             if not response.ok:
+                 raise ValueError(f"Failed to generate audio. {await response.text()}")
+
+             buffer = ""
+             async for chunk_bytes in response.content.iter_any():
+                 buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
+                 for output in outputs:
+                     yield output
+
+             if buffer:
+                 try:
+                     chunk_json = json.loads(buffer)
+                     audio = base64.b64decode(chunk_json["data"])
+                     yield {"audio": audio}
+                 except json.JSONDecodeError:
+                     pass
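A hedged usage sketch for the class above, mirroring the doctest in the sync `_SSE` class later in this diff. The attribute path `client.tts.sse` and the need to `await` it are assumptions, since the client and TTS resource modules are not included here.

    import asyncio
    from cartesia import AsyncCartesia

    async def main():
        client = AsyncCartesia(api_key="your-api-key")
        # With stream=True, send() returns an async generator of {"audio": bytes} chunks.
        chunks = await client.tts.sse(
            model_id="sonic-english",
            transcript="Hello world!",
            voice_id="your-voice-id",
            output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
            stream=True,
        )
        async for chunk in chunks:
            audio = chunk["audio"]  # raw PCM bytes, base64-decoded by _update_buffer

    asyncio.run(main())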
cartesia/_async_websocket.py ADDED
@@ -0,0 +1,313 @@
+ import asyncio
+ import uuid
+ from collections import defaultdict
+ from types import TracebackType
+ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
+
+ import aiohttp
+
+ from cartesia._constants import DEFAULT_MODEL_ID, DEFAULT_VOICE_EMBEDDING
+ from cartesia._types import OutputFormat, VoiceControls
+ from cartesia._websocket import _WebSocket
+ from cartesia.tts import TTS
+ from cartesia.utils.tts import _construct_tts_request
+
+
+ class _AsyncTTSContext:
+     """Manage a single context over an AsyncWebSocket.
+
+     This class separates sending requests and receiving responses into two separate methods.
+     This can be used for sending multiple requests without awaiting the response.
+     Then you can listen to the responses in the order they were sent. See README for usage.
+
+     Each AsyncTTSContext will close automatically when a done message is received for that context.
+     This happens when the no_more_inputs method is called (equivalent to sending a request with `continue_ = False`),
+     or if no requests have been sent for 5 seconds on the same context. It also closes if there is an error.
+     """
+
+     def __init__(self, context_id: str, websocket: "_AsyncWebSocket", timeout: float):
+         self._context_id = context_id
+         self._websocket = websocket
+         self.timeout = timeout
+         self._error = None
+
+     @property
+     def context_id(self) -> str:
+         return self._context_id
+
+     async def send(
+         self,
+         model_id: str,
+         transcript: str,
+         output_format: OutputFormat,
+         voice_id: Optional[str] = None,
+         voice_embedding: Optional[List[float]] = None,
+         context_id: Optional[str] = None,
+         continue_: bool = False,
+         duration: Optional[int] = None,
+         language: Optional[str] = None,
+         add_timestamps: bool = False,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
+     ) -> None:
+         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
+
+         Args:
+             model_id: The ID of the model to use for generating audio.
+             transcript: The text to convert to speech.
+             output_format: A dictionary containing the details of the output format.
+             voice_id: The ID of the voice to use for generating audio.
+             voice_embedding: The embedding of the voice to use for generating audio.
+             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
+             continue_: Whether to continue the audio generation from the previous transcript or not.
+             duration: The duration of the audio in seconds.
+             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+             add_timestamps: Whether to return word-level timestamps.
+             _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                 Note: This is an experimental feature and may change rapidly in future releases.
+
+         Returns:
+             None.
+         """
+         if context_id is not None and context_id != self._context_id:
+             raise ValueError("Context ID does not match the context ID of the current context.")
+         if continue_ and transcript == "":
+             raise ValueError("Transcript cannot be empty when continue_ is True.")
+
+         await self._websocket.connect()
+
+         request_body = _construct_tts_request(
+             model_id=model_id,
+             transcript=transcript,
+             output_format=output_format,
+             voice_id=voice_id,
+             voice_embedding=voice_embedding,
+             duration=duration,
+             language=language,
+             context_id=self._context_id,
+             add_timestamps=add_timestamps,
+             continue_=continue_,
+             _experimental_voice_controls=_experimental_voice_controls,
+         )
+
+         await self._websocket.websocket.send_json(request_body)
+
+         # Start listening for responses on the WebSocket
+         self._websocket._dispatch_listener()
+
+     async def no_more_inputs(self) -> None:
+         """Send a request to the WebSocket to indicate that no more requests will be sent."""
+         await self.send(
+             model_id=DEFAULT_MODEL_ID,
+             transcript="",
+             output_format=TTS.get_output_format("raw_pcm_f32le_44100"),
+             voice_embedding=DEFAULT_VOICE_EMBEDDING,  # Default voice embedding since it's a required input for now.
+             context_id=self._context_id,
+             continue_=False,
+         )
+
+     async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:
+         """Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.
+
+         Returns:
+             An async generator that yields audio chunks. Each chunk is a dictionary containing the audio as bytes.
+         """
+         try:
+             while True:
+                 response = await self._websocket._get_message(
+                     self._context_id, timeout=self.timeout
+                 )
+                 if "error" in response:
+                     raise RuntimeError(f"Error generating audio:\n{response['error']}")
+                 if response["done"]:
+                     break
+                 yield self._websocket._convert_response(response, include_context_id=True)
+         except Exception as e:
+             if isinstance(e, asyncio.TimeoutError):
+                 raise RuntimeError("Timeout while waiting for audio chunk")
+             raise RuntimeError(f"Failed to generate audio:\n{e}")
+         finally:
+             self._close()
+
+     def _close(self) -> None:
+         """Closes the context. Automatically called when a done message is received for this context."""
+         self._websocket._remove_context(self._context_id)
+
+     def is_closed(self):
+         """Check if the context is closed or not. Returns True if closed."""
+         return self._context_id not in self._websocket._context_queues
+
+     async def __aenter__(self):
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: Union[type, None],
+         exc: Union[BaseException, None],
+         exc_tb: Union[TracebackType, None],
+     ):
+         self._close()
+
+     def __del__(self):
+         self._close()
+
+
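Based on the docstring above, a sketch of the send/receive split on one context. Here `ws` is assumed to be a connected `_AsyncWebSocket` (defined next); how the client hands one out is not shown in this diff.

    async def speak(ws, output_format):
        ctx = ws.context()  # a random context_id is generated
        # Queue several requests on the same context without awaiting any audio yet.
        for sentence in ["Hello world!", "And a continuation."]:
            await ctx.send(
                model_id="sonic-english",
                transcript=sentence,
                voice_id="your-voice-id",
                output_format=output_format,
                continue_=True,
            )
        await ctx.no_more_inputs()  # equivalent to a final request with continue_=False
        # Chunks arrive in the order the requests were sent; the context
        # closes itself once the done message is received.
        async for chunk in ctx.receive():
            audio = chunk["audio"]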
+ class _AsyncWebSocket(_WebSocket):
+     """This class contains methods to generate audio using WebSocket asynchronously."""
+
+     def __init__(
+         self,
+         ws_url: str,
+         api_key: str,
+         cartesia_version: str,
+         timeout: float,
+         get_session: Callable[[], Optional[aiohttp.ClientSession]],
+     ):
+         """
+         Args:
+             ws_url: The WebSocket URL for the Cartesia API.
+             api_key: The API key to use for authorization.
+             cartesia_version: The version of the Cartesia API to use.
+             timeout: The timeout for responses on the WebSocket in seconds.
+             get_session: A function that returns an aiohttp.ClientSession object.
+         """
+         super().__init__(ws_url, api_key, cartesia_version)
+         self.timeout = timeout
+         self._get_session = get_session
+         self.websocket = None
+         self._context_queues: Dict[str, asyncio.Queue] = {}
+         self._processing_task: asyncio.Task = None
+
+     def __del__(self):
+         try:
+             loop = asyncio.get_running_loop()
+         except RuntimeError:
+             loop = None
+
+         if loop is None:
+             asyncio.run(self.close())
+         elif loop.is_running():
+             loop.create_task(self.close())
+
+     async def connect(self):
+         if self.websocket is None or self._is_websocket_closed():
+             route = "tts/websocket"
+             session = await self._get_session()
+             url = f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
+             try:
+                 self.websocket = await session.ws_connect(url)
+             except Exception as e:
+                 raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
+
+     def _is_websocket_closed(self):
+         return self.websocket.closed
+
+     async def close(self):
+         """This method closes the websocket connection. *Highly* recommended to call this method when done."""
+         if self.websocket is not None and not self._is_websocket_closed():
+             await self.websocket.close()
+         if self._processing_task:
+             self._processing_task.cancel()
+             try:
+                 self._processing_task = None
+             except asyncio.CancelledError:
+                 pass
+             except TypeError as e:
+                 # Ignore the error if the task is already cancelled
+                 # For some reason we are getting None responses
+                 # TODO: This needs to be fixed - we need to think about why we are getting None responses.
+                 if "Received message 256:None" not in str(e):
+                     raise e
+
+         for context_id in list(self._context_queues.keys()):
+             self._remove_context(context_id)
+
+         self._context_queues.clear()
+         self._processing_task = None
+         self.websocket = None
+
+     async def send(
+         self,
+         model_id: str,
+         transcript: str,
+         output_format: OutputFormat,
+         voice_id: Optional[str] = None,
+         voice_embedding: Optional[List[float]] = None,
+         context_id: Optional[str] = None,
+         duration: Optional[int] = None,
+         language: Optional[str] = None,
+         stream: bool = True,
+         add_timestamps: bool = False,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
+     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+         """See :meth:`_WebSocket.send` for details."""
+         if context_id is None:
+             context_id = str(uuid.uuid4())
+
+         ctx = self.context(context_id)
+
+         await ctx.send(
+             model_id=model_id,
+             transcript=transcript,
+             output_format=output_format,
+             voice_id=voice_id,
+             voice_embedding=voice_embedding,
+             context_id=context_id,
+             duration=duration,
+             language=language,
+             continue_=False,
+             add_timestamps=add_timestamps,
+             _experimental_voice_controls=_experimental_voice_controls,
+         )
+
+         generator = ctx.receive()
+
+         if stream:
+             return generator
+
+         chunks = []
+         word_timestamps = defaultdict(list)
+         async for chunk in generator:
+             if "audio" in chunk:
+                 chunks.append(chunk["audio"])
+             if add_timestamps and "word_timestamps" in chunk:
+                 for k, v in chunk["word_timestamps"].items():
+                     word_timestamps[k].extend(v)
+         out = {"audio": b"".join(chunks), "context_id": context_id}
+         if add_timestamps:
+             out["word_timestamps"] = word_timestamps
+         return out
+
+     async def _process_responses(self):
+         try:
+             while True:
+                 response = await self.websocket.receive_json()
+                 if response["context_id"]:
+                     context_id = response["context_id"]
+                     if context_id in self._context_queues:
+                         await self._context_queues[context_id].put(response)
+         except Exception as e:
+             self._error = e
+             raise e
+
+     async def _get_message(self, context_id: str, timeout: float) -> Dict[str, Any]:
+         if context_id not in self._context_queues:
+             raise ValueError(f"Context ID {context_id} not found.")
+         return await asyncio.wait_for(self._context_queues[context_id].get(), timeout=timeout)
+
+     def _remove_context(self, context_id: str):
+         if context_id in self._context_queues:
+             del self._context_queues[context_id]
+
+     def _dispatch_listener(self):
+         if self._processing_task is None or self._processing_task.done():
+             self._processing_task = asyncio.create_task(self._process_responses())
+
+     def context(self, context_id: Optional[str] = None) -> _AsyncTTSContext:
+         if context_id in self._context_queues:
+             raise ValueError(f"AsyncContext for context ID {context_id} already exists.")
+         if context_id is None:
+             context_id = str(uuid.uuid4())
+         if context_id not in self._context_queues:
+             self._context_queues[context_id] = asyncio.Queue()
+         return _AsyncTTSContext(context_id, self, self.timeout)
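A sketch of the non-streaming path through `send` above, which joins the audio chunks and merges word timestamps. How a connected `ws` instance is obtained is not part of this diff, so that part is assumed.

    out = await ws.send(
        model_id="sonic-english",
        transcript="Hello world!",
        voice_id="your-voice-id",
        output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
        stream=False,
        add_timestamps=True,
    )
    audio = out["audio"]            # all chunks joined into a single bytes object
    words = out["word_timestamps"]  # per-key lists merged across chunks
    await ws.close()                # cancels the listener task and clears context queues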
cartesia/_constants.py ADDED
@@ -0,0 +1,10 @@
+ DEFAULT_MODEL_ID = "sonic-english"  # latest default model
+ MULTILINGUAL_MODEL_ID = "sonic-multilingual"  # latest multilingual model
+ DEFAULT_BASE_URL = "api.cartesia.ai"
+ DEFAULT_CARTESIA_VERSION = "2024-06-10"  # latest version
+ DEFAULT_TIMEOUT = 30  # seconds
+ DEFAULT_NUM_CONNECTIONS = 10  # connections per client
+ DEFAULT_VOICE_EMBEDDING = [1.0] * 192
+
+ BACKOFF_FACTOR = 1
+ MAX_RETRIES = 3
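The retry helpers in `cartesia.utils.retry` are not included in this diff, so the exact schedule is not visible here; a conventional exponential backoff built from these two constants would look like this sketch (the `retry_delays` helper is hypothetical):

    def retry_delays(backoff_factor: int = BACKOFF_FACTOR, max_retries: int = MAX_RETRIES):
        # With BACKOFF_FACTOR = 1 and MAX_RETRIES = 3: delays of 1, 2, and 4 seconds.
        return [backoff_factor * (2 ** attempt) for attempt in range(max_retries)]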
cartesia/_logger.py ADDED
@@ -0,0 +1,3 @@
+ import logging
+
+ logger = logging.getLogger(__name__)
cartesia/_sse.py ADDED
@@ -0,0 +1,143 @@
+ import base64
+ import json
+ from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+
+ import requests
+
+ from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
+ from cartesia._logger import logger
+ from cartesia._types import OutputFormat, VoiceControls
+ from cartesia.utils.retry import retry_on_connection_error
+ from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
+
+
+ class _SSE:
+     """This class contains methods to generate audio using Server-Sent Events.
+
+     Usage:
+         >>> for audio_chunk in client.tts.sse(
+         ...     model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
+         ...     output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}, stream=True
+         ... ):
+         ...     audio = audio_chunk["audio"]
+     """
+
+     def __init__(
+         self,
+         http_url: str,
+         headers: Dict[str, str],
+         timeout: float,
+     ):
+         self.http_url = http_url
+         self.headers = headers
+         self.timeout = timeout
+
+     def _update_buffer(self, buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
+         buffer += chunk_bytes.decode("utf-8")
+         outputs = []
+         while "{" in buffer and "}" in buffer:
+             start_index = buffer.find("{")
+             end_index = buffer.find("}", start_index)
+             if start_index != -1 and end_index != -1:
+                 try:
+                     chunk_json = json.loads(buffer[start_index : end_index + 1])
+                     if "error" in chunk_json:
+                         raise RuntimeError(f"Error generating audio:\n{chunk_json['error']}")
+                     if chunk_json["done"]:
+                         break
+                     audio = base64.b64decode(chunk_json["data"])
+                     outputs.append({"audio": audio})
+                     buffer = buffer[end_index + 1 :]
+                 except json.JSONDecodeError:
+                     break
+         return buffer, outputs
+
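A worked example of the buffering above: complete JSON messages are parsed and their base64 `data` decoded, while a trailing partial message is carried over for the next chunk (constructor arguments here are illustrative):

    sse = _SSE(http_url="https://api.cartesia.ai", headers={}, timeout=30)
    buffer, outputs = sse._update_buffer(
        buffer="",
        chunk_bytes=b'{"data": "AAAA", "done": false}{"data": "BB',
    )
    assert outputs == [{"audio": b"\x00\x00\x00"}]  # base64 "AAAA" decodes to three zero bytes
    assert buffer == '{"data": "BB'  # incomplete JSON is kept until more bytes arrive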
+     def send(
+         self,
+         model_id: str,
+         transcript: str,
+         output_format: OutputFormat,
+         voice_id: Optional[str] = None,
+         voice_embedding: Optional[List[float]] = None,
+         duration: Optional[int] = None,
+         language: Optional[str] = None,
+         stream: bool = True,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
+     ) -> Union[bytes, Generator[bytes, None, None]]:
+         """Send a request to the server to generate audio using Server-Sent Events.
+
+         Args:
+             model_id: The ID of the model to use for generating audio.
+             transcript: The text to convert to speech.
+             voice_id: The ID of the voice to use for generating audio.
+             voice_embedding: The embedding of the voice to use for generating audio.
+             output_format: A dictionary containing the details of the output format.
+             duration: The duration of the audio in seconds.
+             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+             stream: Whether to stream the audio or not.
+             _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                 Note: This is an experimental feature and may change rapidly in future releases.
+
+         Returns:
+             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
+             If `stream` is False, the method returns a dictionary.
+             Both the generator and the dictionary contain the following key(s):
+             - audio: The audio as bytes.
+         """
+         request_body = _construct_tts_request(
+             model_id=model_id,
+             transcript=transcript,
+             output_format=output_format,
+             voice_id=voice_id,
+             voice_embedding=voice_embedding,
+             duration=duration,
+             language=language,
+             _experimental_voice_controls=_experimental_voice_controls,
+         )
+
+         generator = self._sse_generator_wrapper(request_body)
+
+         if stream:
+             return generator
+
+         chunks = []
+         for chunk in generator:
+             chunks.append(chunk["audio"])
+
+         return {"audio": b"".join(chunks)}
+
+     @retry_on_connection_error(
+         max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+     )
+     def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
+         """Need to wrap the sse generator in a function for the retry decorator to work."""
+         try:
+             for chunk in self._sse_generator(request_body):
+                 yield chunk
+         except Exception as e:
+             raise RuntimeError(f"Error generating audio. {e}")
+
+     def _sse_generator(self, request_body: Dict[str, Any]):
+         response = requests.post(
+             f"{self.http_url}/tts/sse",
+             stream=True,
+             data=json.dumps(request_body),
+             headers=self.headers,
+             timeout=(self.timeout, self.timeout),
+         )
+         if not response.ok:
+             raise ValueError(f"Failed to generate audio. {response.text}")
+
+         buffer = ""
+         for chunk_bytes in response.iter_content(chunk_size=None):
+             buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
+             for output in outputs:
+                 yield output
+
+         if buffer:
+             try:
+                 chunk_json = json.loads(buffer)
+                 audio = base64.b64decode(chunk_json["data"])
+                 yield {"audio": audio}
+             except json.JSONDecodeError:
+                 pass
cartesia/_types.py ADDED
@@ -0,0 +1,103 @@
+ from typing import List, Optional, TypedDict, Union
+
+ from cartesia.utils.deprecated import deprecated
+
+
+ class OutputFormatMapping:
+     _format_mapping = {
+         "raw_pcm_f32le_44100": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
+         "raw_pcm_s16le_44100": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
+         "raw_pcm_f32le_24000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 24000},
+         "raw_pcm_s16le_24000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 24000},
+         "raw_pcm_f32le_22050": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 22050},
+         "raw_pcm_s16le_22050": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 22050},
+         "raw_pcm_f32le_16000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 16000},
+         "raw_pcm_s16le_16000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
+         "raw_pcm_f32le_8000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 8000},
+         "raw_pcm_s16le_8000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 8000},
+         "raw_pcm_mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
+         "raw_pcm_alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
+     }
+
+     @classmethod
+     def get_format(cls, format_name):
+         if format_name in cls._format_mapping:
+             return cls._format_mapping[format_name]
+         else:
+             raise ValueError(f"Unsupported format: {format_name}")
+
+
+ class DeprecatedOutputFormatMapping:
+     """Deprecated formats as of v1.0.1. These will be removed in v1.2.0. Use :class:`OutputFormatMapping` instead."""
+
+     _format_mapping = {
+         "fp32": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
+         "pcm": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
+         "fp32_8000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 8000},
+         "fp32_16000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 16000},
+         "fp32_22050": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 22050},
+         "fp32_24000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 24000},
+         "fp32_44100": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
+         "pcm_8000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 8000},
+         "pcm_16000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
+         "pcm_22050": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 22050},
+         "pcm_24000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 24000},
+         "pcm_44100": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
+         "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
+         "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
+     }
+
+     @classmethod
+     @deprecated(
+         vdeprecated="1.0.1",
+         vremove="1.2.0",
+         reason="Old output format names are being deprecated in favor of names aligned with the Cartesia API. Use names from `OutputFormatMapping` instead.",
+     )
+     def get_format_deprecated(cls, format_name):
+         if format_name in cls._format_mapping:
+             return cls._format_mapping[format_name]
+         else:
+             raise ValueError(f"Unsupported format: {format_name}")
+
+
+ class VoiceMetadata(TypedDict):
+     id: str
+     name: str
+     description: str
+     embedding: List[float]
+     is_public: bool
+     user_id: str
+     created_at: str
+     language: str
+     base_voice_id: Optional[str] = None
+
+
+ class VoiceControls(TypedDict):
+     """Defines different voice control parameters for voice synthesis.
+
+     For a complete list of supported parameters, refer to the Cartesia API documentation.
+     https://docs.cartesia.ai/reference/api-reference
+
+     Examples:
+         >>> {"speed": "fastest"}
+         >>> {"speed": "slow", "emotion": ["sadness:high"]}
+         >>> {"emotion": ["surprise:highest", "curiosity"]}
+
+     Note:
+         This is an experimental class and is subject to rapid change in future versions.
+     """
+
+     speed: Union[str, float] = ""
+     emotion: List[str] = []
+
+
+ class OutputFormat(TypedDict):
+     container: str
+     encoding: str
+     sample_rate: int
+
+
+ class EventType:
+     NULL = ""
+     AUDIO = "chunk"
+     TIMESTAMPS = "timestamps"
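For reference, the mappings above resolve short names to full output-format dictionaries and raise `ValueError` for unknown names:

    fmt = OutputFormatMapping.get_format("raw_pcm_f32le_44100")
    # {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}

    OutputFormatMapping.get_format("not_a_format")  # raises ValueError: Unsupported format: not_a_format

Old names such as "fp32_44100" still resolve through `DeprecatedOutputFormatMapping.get_format_deprecated` until v1.2.0, which the `@deprecated` decorator presumably flags at call time (its implementation, in `cartesia.utils.deprecated`, is not part of this diff).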