cartesia 0.0.4__py2.py3-none-any.whl → 0.0.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from cartesia.tts import CartesiaTTS
1
+ from cartesia.tts import AsyncCartesiaTTS, CartesiaTTS
2
2
 
3
- __all__ = ["CartesiaTTS"]
3
+ __all__ = ["CartesiaTTS", "AsyncCartesiaTTS"]
cartesia/tts.py CHANGED
@@ -1,16 +1,29 @@
1
+ import asyncio
1
2
  import base64
2
3
  import json
3
4
  import os
4
5
  import uuid
5
- from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
+ from types import TracebackType
7
+ from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
6
8
 
9
+ import aiohttp
10
+ import httpx
11
+ import logging
7
12
  import requests
8
13
  from websockets.sync.client import connect
9
14
 
15
+ from cartesia.utils import retry_on_connection_error, retry_on_connection_error_async
16
+
10
17
  DEFAULT_MODEL_ID = "genial-planet-1346"
11
18
  DEFAULT_BASE_URL = "api.cartesia.ai"
12
19
  DEFAULT_API_VERSION = "v0"
20
+ DEFAULT_TIMEOUT = 30 # seconds
21
+ DEFAULT_NUM_CONNECTIONS = 10 # connections per client
22
+
23
+ BACKOFF_FACTOR = 1
24
+ MAX_RETRIES = 3
13
25
 
26
+ logger = logging.getLogger(__name__)
14
27
 
15
28
  class AudioOutput(TypedDict):
16
29
  audio: bytes
@@ -27,6 +40,37 @@ class VoiceMetadata(TypedDict):
27
40
  embedding: Optional[Embedding]
28
41
 
29
42
 
43
def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
    """Append a raw chunk to ``buffer`` and extract every complete JSON object.

    The text between the first ``{`` and the next ``}`` is parsed as one JSON
    object. Each parsed object contributes one output dictionary, and the
    consumed text (including anything before the opening brace) is dropped
    from the buffer.

    Args:
        buffer: Text left over from previous chunks.
        chunk_bytes: Newly received bytes; decoded as UTF-8.

    Returns:
        A tuple ``(remaining_buffer, outputs)`` where each output has the keys
        ``"audio"`` (base64-decoded ``"data"``) and ``"sampling_rate"``.

    Raises:
        KeyError: If a parsed object lacks ``"data"`` or ``"sampling_rate"``.
    """
    buffer += chunk_bytes.decode("utf-8")
    outputs = []
    while True:
        start_index = buffer.find("{")
        if start_index == -1:
            break
        end_index = buffer.find("}", start_index)
        if end_index == -1:
            # No closing brace after the opening one yet. The previous loop
            # condition only checked that *some* "}" existed, so a stray "}"
            # ahead of "{" made it spin forever; wait for more data instead.
            break
        try:
            chunk_json = json.loads(buffer[start_index : end_index + 1])
        except json.JSONDecodeError:
            # The candidate span is not valid JSON (yet); keep it buffered.
            break
        audio = base64.b64decode(chunk_json["data"])
        outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
        buffer = buffer[end_index + 1 :]
    return buffer, outputs
58
+
59
+
60
def convert_response(response: Dict[str, Any], include_context_id: bool) -> Dict[str, Any]:
    """Convert a decoded websocket message into an audio output dictionary.

    Args:
        response: The JSON message; must contain ``"data"`` (base64 text) and
            ``"sampling_rate"``, plus ``"context_id"`` when requested.
        include_context_id: Whether to copy ``"context_id"`` into the result.

    Returns:
        A dictionary with ``"audio"`` (decoded bytes), ``"sampling_rate"`` and,
        if ``include_context_id`` is true, ``"context_id"``.
    """
    audio = base64.b64decode(response["data"])
    output = {"audio": audio, "sampling_rate": response["sampling_rate"]}
    if include_context_id:
        output["context_id"] = response["context_id"]
    return output
72
+
73
+
30
74
  class CartesiaTTS:
31
75
  """The client for Cartesia's text-to-speech library.
32
76
 
@@ -38,7 +82,6 @@ class CartesiaTTS:
38
82
  To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
39
83
 
40
84
  Examples:
41
-
42
85
  >>> client = CartesiaTTS()
43
86
 
44
87
  # Load available voices and their metadata (excluding the embeddings).
@@ -60,14 +103,13 @@ class CartesiaTTS:
60
103
  """
61
104
 
62
105
  def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
63
- """
64
- Args:
65
- api_key: The API key to use for authorization.
66
- If not specified, the API key will be read from the environment variable
67
- `CARTESIA_API_KEY`.
68
- experimental_ws_handle_interrupts: Whether to handle interrupts when generating
69
- audio using the websocket. This is an experimental feature and may have bugs
70
- or be deprecated in the future.
106
+ """Args:
107
+ api_key: The API key to use for authorization.
108
+ If not specified, the API key will be read from the environment variable
109
+ `CARTESIA_API_KEY`.
110
+ experimental_ws_handle_interrupts: Whether to handle interrupts when generating
111
+ audio using the websocket. This is an experimental feature and may have bugs
112
+ or be deprecated in the future.
71
113
  """
72
114
  self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
73
115
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
@@ -75,7 +117,6 @@ class CartesiaTTS:
75
117
  self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
76
118
  self.websocket = None
77
119
  self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
78
- self.refresh_websocket()
79
120
 
80
121
  def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
81
122
  """Returns a mapping from voice name -> voice metadata.
@@ -108,9 +149,14 @@ class CartesiaTTS:
108
149
  >>> audio = client.generate(transcript="Hello world!", voice=embedding)
109
150
  """
110
151
  params = {"select": "id, name, description"} if skip_embeddings else None
111
- response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
152
+ response = httpx.get(
153
+ f"{self._http_url()}/voices",
154
+ headers=self.headers,
155
+ params=params,
156
+ timeout=DEFAULT_TIMEOUT,
157
+ )
112
158
 
113
- if response.status_code != 200:
159
+ if not response.is_success:
114
160
  raise ValueError(f"Failed to get voices. Error: {response.text}")
115
161
 
116
162
  voices = response.json()
@@ -120,6 +166,7 @@ class CartesiaTTS:
120
166
  voice["embedding"] = json.loads(voice["embedding"])
121
167
  return {voice["name"]: voice for voice in voices}
122
168
 
169
+ @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
123
170
  def get_voice_embedding(
124
171
  self, *, voice_id: str = None, filepath: str = None, link: str = None
125
172
  ) -> Embedding:
@@ -142,20 +189,20 @@ class CartesiaTTS:
142
189
 
143
190
  if voice_id:
144
191
  url = f"{self._http_url()}/voices/embedding/{voice_id}"
145
- response = requests.get(url, headers=self.headers)
192
+ response = httpx.get(url, headers=self.headers, timeout=DEFAULT_TIMEOUT)
146
193
  elif filepath:
147
194
  url = f"{self._http_url()}/voices/clone/clip"
148
195
  files = {"clip": open(filepath, "rb")}
149
196
  headers = self.headers.copy()
150
197
  # The default content type of JSON is incorrect for file uploads
151
198
  headers.pop("Content-Type")
152
- response = requests.post(url, headers=headers, files=files)
199
+ response = httpx.post(url, headers=headers, files=files, timeout=DEFAULT_TIMEOUT)
153
200
  elif link:
154
201
  url = f"{self._http_url()}/voices/clone/url"
155
202
  params = {"link": link}
156
- response = requests.post(url, headers=self.headers, params=params)
203
+ response = httpx.post(url, headers=self.headers, params=params, timeout=DEFAULT_TIMEOUT)
157
204
 
158
- if response.status_code != 200:
205
+ if not response.is_success:
159
206
  raise ValueError(
160
207
  f"Failed to clone voice. Status Code: {response.status_code}\n"
161
208
  f"Error: {response.text}"
@@ -173,15 +220,11 @@ class CartesiaTTS:
173
220
  Note:
174
221
  The connection is synchronous.
175
222
  """
176
- if self.websocket and not self._is_websocket_closed():
177
- self.websocket.close()
178
- route = "audio/websocket"
179
- if self.experimental_ws_handle_interrupts:
180
- route = f"experimental/{route}"
181
- self.websocket = connect(
182
- f"{self._ws_url()}/{route}?api_key={self.api_key}",
183
- close_timeout=None,
184
- )
223
+ if self.websocket is None or self._is_websocket_closed():
224
+ route = "audio/websocket"
225
+ if self.experimental_ws_handle_interrupts:
226
+ route = f"experimental/{route}"
227
+ self.websocket = connect(f"{self._ws_url()}/{route}?api_key={self.api_key}")
185
228
 
186
229
  def _is_websocket_closed(self):
187
230
  return self.websocket.socket.fileno() == -1
@@ -200,29 +243,53 @@ class CartesiaTTS:
200
243
  if transcript.strip() == "":
201
244
  raise ValueError("`transcript` must be non empty")
202
245
 
246
def _generate_request_body(
    self,
    *,
    transcript: str,
    voice: Embedding,
    model_id: str,
    duration: int = None,
    chunk_time: float = None,
) -> Dict[str, Any]:
    """Build the JSON-serializable body for a generation request.

    ``transcript``, ``model_id`` and ``voice`` are always included.
    ``duration`` and ``chunk_time`` are included only when they are not None.
    """
    request = {"transcript": transcript, "model_id": model_id, "voice": voice}

    # Optional fields are appended in a fixed order and skipped when unset.
    for key, value in (("duration", duration), ("chunk_time", chunk_time)):
        if value is not None:
            request[key] = value

    return request
269
+
203
270
  def generate(
204
271
  self,
205
272
  *,
206
273
  transcript: str,
274
+ voice: Embedding,
275
+ model_id: str = DEFAULT_MODEL_ID,
207
276
  duration: int = None,
208
277
  chunk_time: float = None,
209
- voice: Embedding = None,
210
278
  stream: bool = False,
211
279
  websocket: bool = True,
212
280
  ) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
213
281
  """Generate audio from a transcript.
214
282
 
215
283
  Args:
216
- transcript: The text to generate audio for.
217
- duration: The maximum duration of the audio in seconds.
218
- chunk_time: How long each audio segment should be in seconds.
284
+ transcript (str): The text to generate audio for.
285
+ voice (Embedding (List[float])): The voice to use for generating audio.
286
+ duration (int, optional): The maximum duration of the audio in seconds.
287
+ chunk_time (float, optional): How long each audio segment should be in seconds.
219
288
  This should not need to be adjusted.
220
- voice: The voice to use for generating audio.
221
- This can either be a voice id (string) or an embedding vector (List[float]).
222
- stream: Whether to stream the audio or not.
223
- If ``True`` this function returns a generator.
224
- websocket: Whether to use a websocket for streaming audio.
225
- Using the websocket reduces latency by pre-poning the handshake.
289
+ stream (bool, optional): Whether to stream the audio or not.
290
+ If True this function returns a generator. False by default.
291
+ websocket (bool, optional): Whether to use a websocket for streaming audio.
292
+ Using the websocket reduces latency by pre-poning the handshake. True by default.
226
293
 
227
294
  Returns:
228
295
  A generator if `stream` is True, otherwise a dictionary.
@@ -232,19 +299,18 @@ class CartesiaTTS:
232
299
  """
233
300
  self._check_inputs(transcript, duration, chunk_time)
234
301
 
235
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
236
-
237
- optional_body = dict(
238
- duration=duration,
239
- chunk_time=chunk_time,
240
- voice=voice,
302
+ body = self._generate_request_body(
303
+ transcript=transcript,
304
+ voice=voice,
305
+ model_id=model_id,
306
+ duration=duration,
307
+ chunk_time=chunk_time
241
308
  )
242
- body.update({k: v for k, v in optional_body.items() if v is not None})
243
309
 
244
310
  if websocket:
245
311
  generator = self._generate_ws(body)
246
312
  else:
247
- generator = self._generate_http(body)
313
+ generator = self._generate_http_wrapper(body)
248
314
 
249
315
  if stream:
250
316
  return generator
@@ -258,30 +324,32 @@ class CartesiaTTS:
258
324
 
259
325
  return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
260
326
 
327
@retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
def _generate_http_wrapper(self, body: Dict[str, Any]):
    """Stream the chunks of `_generate_http`, logging any failure before re-raising it.

    The HTTP generator is wrapped in its own function so that the retry
    decorator can be applied to it.

    NOTE(review): this is itself a generator function, so calling it returns
    a generator before any request is made. Confirm that the retry decorator
    actually observes connection errors raised during iteration.
    """
    try:
        http_chunks = self._generate_http(body)
        for piece in http_chunks:
            yield piece
    except Exception as err:
        logger.error(f"Failed to generate audio. {err}")
        raise err
335
+ raise e
336
+
261
337
  def _generate_http(self, body: Dict[str, Any]):
262
338
  response = requests.post(
263
339
  f"{self._http_url()}/audio/stream",
264
340
  stream=True,
265
341
  data=json.dumps(body),
266
342
  headers=self.headers,
343
+ timeout=(DEFAULT_TIMEOUT, DEFAULT_TIMEOUT),
267
344
  )
268
- if response.status_code != 200:
345
+ if not response.ok:
269
346
  raise ValueError(f"Failed to generate audio. {response.text}")
270
347
 
271
348
  buffer = ""
272
349
  for chunk_bytes in response.iter_content(chunk_size=None):
273
- buffer += chunk_bytes.decode("utf-8")
274
- while "{" in buffer and "}" in buffer:
275
- start_index = buffer.find("{")
276
- end_index = buffer.find("}", start_index)
277
- if start_index != -1 and end_index != -1:
278
- try:
279
- chunk_json = json.loads(buffer[start_index : end_index + 1])
280
- audio = base64.b64decode(chunk_json["data"])
281
- yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
282
- buffer = buffer[end_index + 1 :]
283
- except json.JSONDecodeError:
284
- break
350
+ buffer, outputs = update_buffer(buffer, chunk_bytes)
351
+ for output in outputs:
352
+ yield output
285
353
 
286
354
  if buffer:
287
355
  try:
@@ -313,17 +381,8 @@ class CartesiaTTS:
313
381
  response = json.loads(self.websocket.recv())
314
382
  if response["done"]:
315
383
  break
316
- audio = base64.b64decode(response["data"])
317
384
 
318
- optional_kwargs = {}
319
- if include_context_id:
320
- optional_kwargs["context_id"] = response["context_id"]
321
-
322
- yield {
323
- "audio": audio,
324
- "sampling_rate": response["sampling_rate"],
325
- **optional_kwargs,
326
- }
385
+ yield convert_response(response, include_context_id)
327
386
 
328
387
  if self.experimental_ws_handle_interrupts:
329
388
  self.websocket.send(json.dumps({"context_id": context_id}))
@@ -334,7 +393,43 @@ class CartesiaTTS:
334
393
  if self.experimental_ws_handle_interrupts:
335
394
  self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
336
395
  except Exception as e:
396
+ # Close the websocket connection if an error occurs.
397
+ if self.websocket and not self._is_websocket_closed():
398
+ self.websocket.close()
337
399
  raise RuntimeError(f"Failed to generate audio. {response}") from e
400
+ finally:
401
+ # Ensure the websocket is ultimately closed.
402
+ if self.websocket and not self._is_websocket_closed():
403
+ self.websocket.close()
404
+
405
@retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
def transcribe(self, raw_audio: Union[bytes, str]) -> str:
    """Transcribe audio by posting it to the ``/audio/transcriptions`` endpoint.

    Args:
        raw_audio: Either the audio bytes, or a path to a file that is read
            in binary mode (see `prepare_audio_and_headers`).

    Returns:
        The ``"text"`` field of the JSON response.

    Raises:
        ValueError: If the server responds with a non-success status code.
    """
    raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
    response = httpx.post(
        f"{self._http_url()}/audio/transcriptions",
        headers=headers,
        files={"clip": ("input.wav", raw_audio_bytes)},
        timeout=DEFAULT_TIMEOUT,
    )

    if not response.is_success:
        # `httpx.Response.text` is a property. Calling it raised a TypeError
        # that hid the server's error message.
        raise ValueError(f"Failed to transcribe audio. Error: {response.text}")

    transcript = response.json()
    return transcript["text"]
420
+
421
+
422
def prepare_audio_and_headers(
    self, raw_audio: Union[bytes, str]
) -> Tuple[bytes, Dict[str, Any]]:
    """Return the audio bytes and the request headers for an audio upload.

    A ``str`` argument is treated as a file path and read in binary mode;
    anything else is passed through unchanged. The returned headers are a
    copy of ``self.headers`` without the ``Content-Type`` entry.
    """
    if isinstance(raw_audio, str):
        with open(raw_audio, "rb") as audio_file:
            audio_payload = audio_file.read()
    else:
        audio_payload = raw_audio

    # application/json is not the right content type for this request
    upload_headers = dict(self.headers)
    upload_headers.pop("Content-Type", None)
    return audio_payload, upload_headers
338
433
 
339
434
  def _http_url(self):
340
435
  prefix = "http" if "localhost" in self.base_url else "https"
@@ -344,6 +439,235 @@ class CartesiaTTS:
344
439
  prefix = "ws" if "localhost" in self.base_url else "wss"
345
440
  return f"{prefix}://{self.base_url}/{self.api_version}"
346
441
 
347
- def __del__(self):
348
- if self.websocket.socket.fileno() > -1:
442
+ def close(self):
443
+ if self.websocket and not self._is_websocket_closed():
349
444
  self.websocket.close()
445
+
446
+ def __del__(self):
447
+ self.close()
448
+
449
+ def __enter__(self):
450
+ self.refresh_websocket()
451
+ return self
452
+
453
+ def __exit__(
454
+ self,
455
+ exc_type: Union[type, None],
456
+ exc: Union[BaseException, None],
457
+ exc_tb: Union[TracebackType, None],
458
+ ):
459
+ self.close()
460
+
461
+
462
class AsyncCartesiaTTS(CartesiaTTS):
    """Asynchronous counterpart of `CartesiaTTS` built on an `aiohttp` session.

    The HTTP session and the websocket are created lazily and are tied to the
    event loop in which they were first used. Call `close` (or use the client
    as an ``async with`` context manager) when you are done with it.
    """

    def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
        """Args:
        api_key: The API key to use for authorization.
        experimental_ws_handle_interrupts: Whether to handle interrupts when generating
            audio using the websocket.
        """
        # Assigned before delegating so that `close`/`__del__` can rely on them.
        self._session = None
        self._loop = None
        super().__init__(
            api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
        )

    async def _get_session(self):
        """Return the shared `aiohttp.ClientSession`, creating it when needed."""
        # This is a coroutine, so a loop is always running here;
        # `get_running_loop` is the non-deprecated way to obtain it.
        current_loop = asyncio.get_running_loop()
        if self._loop is not current_loop:
            # If the loop has changed, close the session and create a new one.
            await self.close()
        if self._session is None or self._session.closed:
            timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
            connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
            self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
            self._loop = current_loop
        return self._session

    async def refresh_websocket(self):
        """Open the websocket connection if there is none or it has been closed."""
        if self.websocket is None or self._is_websocket_closed():
            route = "audio/websocket"
            if self.experimental_ws_handle_interrupts:
                route = f"experimental/{route}"
            session = await self._get_session()
            self.websocket = await session.ws_connect(
                f"{self._ws_url()}/{route}?api_key={self.api_key}"
            )

    def _is_websocket_closed(self):
        """Return the ``closed`` flag of the current websocket."""
        return self.websocket.closed

    async def close(self):
        """Close the websocket and the session.

        It is *strongly* recommended to call this method when you are done using the client.
        """
        if self.websocket is not None and not self._is_websocket_closed():
            await self.websocket.close()
        if self._session is not None and not self._session.closed:
            await self._session.close()

    async def generate(
        self,
        *,
        transcript: str,
        voice: Embedding,
        model_id: str = DEFAULT_MODEL_ID,
        duration: int = None,
        chunk_time: float = None,
        stream: bool = False,
        websocket: bool = True,
    ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
        """Asynchronously generate audio from a transcript.

        NOTE: This overrides the non-asynchronous generate method from the base class.

        Args:
            transcript (str): The text to generate audio for.
            voice (Embedding (List[float])): The voice to use for generating audio.
            model_id (str, optional): The model to use. Defaults to `DEFAULT_MODEL_ID`.
            duration (int, optional): The maximum duration of the audio in seconds.
            chunk_time (float, optional): How long each audio segment should be in seconds.
                This should not need to be adjusted.
            stream (bool, optional): Whether to stream the audio or not.
                If True this function returns an async generator. False by default.
            websocket (bool, optional): Whether to use a websocket for streaming audio.
                Using the websocket reduces latency because the handshake is done ahead
                of time. True by default.

        Returns:
            An async generator if `stream` is True, otherwise a dictionary.
            Dictionary from both generator and non-generator return types have the following keys:
                * "audio": The audio as a bytes buffer.
                * "sampling_rate": The sampling rate of the audio.
        """
        self._check_inputs(transcript, duration, chunk_time)

        body = self._generate_request_body(
            transcript=transcript,
            voice=voice,
            model_id=model_id,
            duration=duration,
            chunk_time=chunk_time,
        )

        if websocket:
            generator = self._generate_ws(body)
        else:
            generator = self._generate_http_wrapper(body)

        if stream:
            return generator

        # Non-streaming: drain the generator and concatenate the audio.
        chunks = []
        sampling_rate = None
        async for chunk in generator:
            if sampling_rate is None:
                sampling_rate = chunk["sampling_rate"]
            chunks.append(chunk["audio"])

        return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}

    @retry_on_connection_error_async(
        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
    )
    async def _generate_http_wrapper(self, body: Dict[str, Any]):
        """Need to wrap the http generator in a function for the retry decorator to work."""
        try:
            async for chunk in self._generate_http(body):
                yield chunk
        except Exception as e:
            logger.error(f"Failed to generate audio. {e}")
            raise e

    async def _generate_http(self, body: Dict[str, Any]):
        """Stream audio chunks from the ``/audio/stream`` HTTP endpoint.

        Raises:
            ValueError: If the server responds with a non-success status code.
        """
        session = await self._get_session()
        async with session.post(
            f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
        ) as response:
            if not response.ok:
                raise ValueError(f"Failed to generate audio. {await response.text()}")

            buffer = ""
            async for chunk_bytes in response.content.iter_any():
                buffer, outputs = update_buffer(buffer, chunk_bytes)
                for output in outputs:
                    yield output

            # Whatever is left may be one final, complete JSON object.
            if buffer:
                try:
                    chunk_json = json.loads(buffer)
                    audio = base64.b64decode(chunk_json["data"])
                    yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
                except json.JSONDecodeError:
                    pass

    async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
        """Stream audio chunks over the websocket.

        Args:
            body: The request body to send.
            context_id: Identifier for this generation. When given, it is also
                included in every yielded chunk; otherwise a random one is used.

        Raises:
            RuntimeError: If receiving or converting a message fails. The
                message contains the last response received (or None).
        """
        # Decided before a default id is generated: only caller-supplied ids are echoed back.
        include_context_id = bool(context_id)

        if not self.websocket or self._is_websocket_closed():
            await self.refresh_websocket()

        ws = self.websocket
        if context_id is None:
            context_id = uuid.uuid4().hex
        await ws.send_json({"data": body, "context_id": context_id})

        response = None
        try:
            while True:
                response = await ws.receive_json()
                if response["done"]:
                    break

                yield convert_response(response, include_context_id)

                if self.experimental_ws_handle_interrupts:
                    await ws.send_json({"context_id": context_id})
        except GeneratorExit:
            # The exit is only called when the generator is garbage collected.
            # It may not be called directly after a break statement.
            # However, the generator will be automatically cancelled on the next request.
            if self.experimental_ws_handle_interrupts:
                await ws.send_json({"context_id": context_id, "action": "cancel"})
        except Exception as e:
            # `response` is the decoded JSON dict (or None if nothing was
            # received), so it has no `.text()`; awaiting it masked the real error.
            raise RuntimeError(f"Failed to generate audio. {response}") from e
        finally:
            # Ensure the websocket is ultimately closed, on success and on error alike.
            if self.websocket and not self._is_websocket_closed():
                await self.websocket.close()

    async def transcribe(self, raw_audio: Union[bytes, str]) -> str:
        """Transcribe audio by posting it to the ``/audio/transcriptions`` endpoint.

        Args:
            raw_audio: Either the audio bytes or a path to an audio file.

        Returns:
            The ``"text"`` field of the JSON response.

        Raises:
            ValueError: If the server responds with a non-success status code.
        """
        raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
        data = aiohttp.FormData()
        data.add_field("clip", raw_audio_bytes, filename="input.wav", content_type="audio/wav")
        session = await self._get_session()

        async with session.post(
            f"{self._http_url()}/audio/transcriptions", headers=headers, data=data
        ) as response:
            if not response.ok:
                raise ValueError(f"Failed to transcribe audio. Error: {await response.text()}")

            transcript = await response.json()
            return transcript["text"]

    def __del__(self):
        # Skip the event-loop machinery entirely when there is nothing open;
        # this also covers instances whose initialization did not complete.
        session = getattr(self, "_session", None)
        websocket = getattr(self, "websocket", None)
        session_open = session is not None and not session.closed
        websocket_open = websocket is not None and not websocket.closed
        if not (session_open or websocket_open):
            return

        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop is None:
            asyncio.run(self.close())
        else:
            loop.create_task(self.close())

    async def __aenter__(self):
        await self.refresh_websocket()
        return self

    async def __aexit__(
        self,
        exc_type: Union[type, None],
        exc: Union[BaseException, None],
        exc_tb: Union[TracebackType, None],
    ):
        await self.close()
cartesia/utils.py ADDED
@@ -0,0 +1,65 @@
1
+ import time
2
+
3
+ from aiohttp.client_exceptions import ServerDisconnectedError
4
+ import asyncio
5
+ from functools import wraps
6
+ from http.client import RemoteDisconnected
7
+ from httpx import TimeoutException
8
+ from requests.exceptions import ConnectionError
9
+
10
def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
    """Retry a function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.

    The delay before retry ``k`` (1-based) is ``backoff_factor * 2 ** (k - 1)`` seconds.
    Once every attempt has failed, a plain ``Exception`` chained to the last
    error is raised.

    NOTE: only the call itself is guarded. If ``func`` is a generator
    function, the call merely creates the generator, so errors raised while
    iterating it are not retried here.

    Args:
        max_retries (int): The maximum number of attempts. Values below 1 are
            treated as 1 so that the function is always called.
        backoff_factor (int): The factor to increase the delay between retries.
        logger (logging.Logger): The logger to use for logging. Defaults to
            this module's logger.
    """
    import logging  # local import: only needed for the default logger

    # The default of None used to crash with AttributeError on the first retry.
    log = logger if logger is not None else logging.getLogger(__name__)
    # With max_retries <= 0 the loop body never ran and None was returned
    # without ever calling the function.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retry_count = 0
            while retry_count < attempts:
                try:
                    return func(*args, **kwargs)
                except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
                    log.info(f"Retrying after exception: {e}")
                    retry_count += 1
                    if retry_count < attempts:
                        delay = backoff_factor * (2 ** (retry_count - 1))
                        # `Logger.warn` is a deprecated alias of `warning`.
                        log.warning(f"Attempt {retry_count + 1}/{attempts} in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        raise Exception(f"Exception occurred after {attempts} tries.") from e

        return wrapper

    return decorator
36
+
37
def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None):
    """Retry an asynchronous generator function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.

    The wrapped callable must return an async iterable; the wrapper is itself
    an async generator that re-yields its items. The delay before retry ``k``
    (1-based) is ``backoff_factor * 2 ** (k - 1)`` seconds.

    NOTE: a retry calls ``func`` again from the start, so items that were
    already yielded before the failure are yielded again.

    Args:
        max_retries (int): The maximum number of attempts. Values below 1 are
            treated as 1 so that the function is always called.
        backoff_factor (int): The factor to increase the delay between retries.
        logger (logging.Logger): The logger to use for logging. Defaults to
            this module's logger.
    """
    import logging  # local import: only needed for the default logger

    # The default of None used to crash with AttributeError on the first retry.
    log = logger if logger is not None else logging.getLogger(__name__)
    # With max_retries <= 0 the loop body never ran and nothing was yielded.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            retry_count = 0
            while retry_count < attempts:
                try:
                    async for chunk in func(*args, **kwargs):
                        yield chunk
                    # If the function completes without raising an exception return
                    return
                except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
                    log.info(f"Retrying after exception: {e}")
                    retry_count += 1
                    if retry_count < attempts:
                        delay = backoff_factor * (2 ** (retry_count - 1))
                        # `Logger.warn` is a deprecated alias of `warning`.
                        log.warning(f"Attempt {retry_count + 1}/{attempts} in {delay} seconds...")
                        await asyncio.sleep(delay)
                    else:
                        raise Exception(f"Exception occurred after {attempts} tries.") from e

        return wrapper

    return decorator
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.4"
1
+ __version__ = "0.0.5"
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.1
2
+ Name: cartesia
3
+ Version: 0.0.5
4
+ Summary: The official Python library for the Cartesia API.
5
+ Home-page:
6
+ Author: Cartesia, Inc.
7
+ Author-email: support@cartesia.ai
8
+ Classifier: Programming Language :: Python
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Requires-Python: >=3.8.0
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: aiohttp
14
+ Requires-Dist: httpx
15
+ Requires-Dist: pytest-asyncio
16
+ Requires-Dist: requests
17
+ Requires-Dist: websockets
18
+ Provides-Extra: all
19
+ Requires-Dist: pytest >=8.0.2 ; extra == 'all'
20
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
21
+ Requires-Dist: twine ; extra == 'all'
22
+ Requires-Dist: setuptools ; extra == 'all'
23
+ Requires-Dist: wheel ; extra == 'all'
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
26
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
27
+ Requires-Dist: twine ; extra == 'dev'
28
+ Requires-Dist: setuptools ; extra == 'dev'
29
+ Requires-Dist: wheel ; extra == 'dev'
30
+
31
+
32
+ # Cartesia Python API Library
33
+ The official Cartesia Python library, which provides convenient access to the Cartesia REST and WebSocket APIs from any Python 3.8+ application.
34
+
35
+ **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
36
+
37
+ ## Installation
38
+ ```bash
39
+ pip install cartesia
40
+
41
+ # pip install in editable mode w/ dev dependencies
42
+ pip install -e '.[dev]'
43
+ ```
44
+
45
+ ## Usage
46
+ ```python
47
+ from cartesia.tts import CartesiaTTS
48
+ import pyaudio
49
+ import os
50
+
51
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
52
+ voices = client.get_voices()
53
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
54
+ transcript = "Hello! Welcome to Cartesia"
55
+ model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
56
+
57
+ p = pyaudio.PyAudio()
58
+
59
+ stream = None
60
+
61
+ # Generate and stream audio
62
+ for output in client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
63
+ buffer = output["audio"]
64
+ rate = output["sampling_rate"]
65
+
66
+ if not stream:
67
+ stream = p.open(format=pyaudio.paFloat32,
68
+ channels=1,
69
+ rate=rate,
70
+ output=True)
71
+
72
+ # Write the audio data to the stream
73
+ stream.write(buffer)
74
+
75
+ stream.stop_stream()
76
+ stream.close()
77
+ p.terminate()
78
+ ```
79
+
80
+ You can also use the async client if you want to make asynchronous API calls:
81
+ ```python
82
+ from cartesia.tts import AsyncCartesiaTTS
83
+ import asyncio
84
+ import pyaudio
85
+ import os
86
+
87
+ async def write_stream():
88
+ client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
89
+ voices = client.get_voices()
90
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
91
+ transcript = "Hello! Welcome to Cartesia"
92
+ model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
93
+
94
+ p = pyaudio.PyAudio()
95
+
96
+ stream = None
97
+
98
+ # Generate and stream audio
99
+ async for output in await client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
100
+ buffer = output["audio"]
101
+ rate = output["sampling_rate"]
102
+
103
+ if not stream:
104
+ stream = p.open(format=pyaudio.paFloat32,
105
+ channels=1,
106
+ rate=rate,
107
+ output=True)
108
+
109
+ # Write the audio data to the stream
110
+ stream.write(buffer)
111
+
112
+ stream.stop_stream()
113
+ stream.close()
114
+ p.terminate()
115
+
116
+ asyncio.run(write_stream())
117
+ ```
118
+
119
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
120
+ Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
121
+
122
+ ```python
123
+ from IPython.display import Audio
124
+ import io
125
+ import os
126
+ import numpy as np
127
+
128
+ from cartesia.tts import CartesiaTTS
129
+
130
+ with CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
131
+ voices = client.get_voices()
132
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
133
+ transcript = "Hello! Welcome to Cartesia"
134
+
135
+ # Create a BytesIO object to store the audio data
136
+ audio_data = io.BytesIO()
137
+
138
+ # Generate and stream audio
139
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
140
+ buffer = output["audio"]
141
+ audio_data.write(buffer)
142
+
143
+ # Set the cursor position to the beginning of the BytesIO object
144
+ audio_data.seek(0)
145
+
146
+ # Create an Audio object from the BytesIO data
147
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
148
+
149
+ # Display the Audio object
150
+ display(audio)
151
+ ```
152
+
153
+ Below is the same example using the async client:
154
+ ```python
155
+ from IPython.display import Audio
156
+ import io
157
+ import os
158
+ import numpy as np
159
+
160
+ from cartesia.tts import AsyncCartesiaTTS
161
+
162
+ async with AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
163
+ voices = client.get_voices()
164
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
165
+ transcript = "Hello! Welcome to Cartesia"
166
+
167
+ # Create a BytesIO object to store the audio data
168
+ audio_data = io.BytesIO()
169
+
170
+ # Generate and stream audio
171
+ async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
172
+ buffer = output["audio"]
173
+ audio_data.write(buffer)
174
+
175
+ # Set the cursor position to the beginning of the BytesIO object
176
+ audio_data.seek(0)
177
+
178
+ # Create an Audio object from the BytesIO data
179
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
180
+
181
+ # Display the Audio object
182
+ display(audio)
183
+ ```
184
+
185
+ To avoid storing your API key in the source code, we recommend doing one of the following:
186
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
187
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -0,0 +1,8 @@
1
+ cartesia/__init__.py,sha256=uIc9xGNPs8_A6eAvbTUY1geazunYoEZVWFKhCwC9TRA,102
2
+ cartesia/tts.py,sha256=Gtm9qse83g3SX4-KbmlxOAvTQcZIjmUkMfBKu2Xf9rY,26449
3
+ cartesia/utils.py,sha256=GoTJe8LZ3WpS4hXkwoZauPYjo7Mbx7BvbBjAX5vEbwg,3024
4
+ cartesia/version.py,sha256=S7u1lbuWmM3A3ajykBialmPoJUK6Jg-WmNqM-9OZFdk,22
5
+ cartesia-0.0.5.dist-info/METADATA,sha256=oK64bcTyLhrosXh9FjuEwB2SUdQzbYsxzOWCnf6qaI4,5974
6
+ cartesia-0.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
7
+ cartesia-0.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
8
+ cartesia-0.0.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.2)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py2-none-any
5
5
  Tag: py3-none-any
@@ -1,115 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: cartesia
3
- Version: 0.0.4
4
- Summary: The official Python library for the Cartesia API.
5
- Home-page:
6
- Author: Cartesia, Inc.
7
- Author-email: support@cartesia.ai
8
- Classifier: Programming Language :: Python
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
- Requires-Python: >=3.8.0
12
- Description-Content-Type: text/markdown
13
- Requires-Dist: websockets
14
- Requires-Dist: requests
15
- Provides-Extra: all
16
- Requires-Dist: pre-commit ; extra == 'all'
17
- Requires-Dist: docformatter ; extra == 'all'
18
- Requires-Dist: black ==24.1.1 ; extra == 'all'
19
- Requires-Dist: isort ==5.13.2 ; extra == 'all'
20
- Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
21
- Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
22
- Requires-Dist: pytest >=8.0.2 ; extra == 'all'
23
- Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
24
- Provides-Extra: dev
25
- Requires-Dist: pre-commit ; extra == 'dev'
26
- Requires-Dist: docformatter ; extra == 'dev'
27
- Requires-Dist: black ==24.1.1 ; extra == 'dev'
28
- Requires-Dist: isort ==5.13.2 ; extra == 'dev'
29
- Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
30
- Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
31
- Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
32
- Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
33
-
34
-
35
- # Cartesia Python API Library
36
- The official Cartesia Python library provides convenient access to the Cartesia REST and WebSocket APIs from any Python 3.8+ application.
37
-
38
- **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
39
-
40
- ## Installation
41
- ```bash
42
- pip install cartesia
43
-
44
- # pip install in editable mode w/ dev dependencies
45
- pip install -e '.[dev]'
46
- ```
47
-
48
- ## Usage
49
- ```python
50
- from cartesia.tts import CartesiaTTS
51
- import pyaudio
52
- import os
53
-
54
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
- voices = client.get_voices()
56
- voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
57
- transcript = "Hello! Welcome to Cartesia"
58
-
59
- p = pyaudio.PyAudio()
60
-
61
- stream = None
62
-
63
- # Generate and stream audio
64
- for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
- buffer = output["audio"]
66
- rate = output["sampling_rate"]
67
-
68
- if not stream:
69
- stream = p.open(format=pyaudio.paFloat32,
70
- channels=1,
71
- rate=rate,
72
- output=True)
73
-
74
- # Write the audio data to the stream
75
- stream.write(buffer)
76
-
77
- stream.stop_stream()
78
- stream.close()
79
- p.terminate()
80
- ```
81
-
82
- If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
83
-
84
- ```python
85
- from cartesia.tts import CartesiaTTS
86
- from IPython.display import Audio
87
- import io
88
- import os
89
-
90
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
- voices = client.get_voices()
92
- voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
- transcript = "Hello! Welcome to Cartesia"
94
-
95
- # Create a BytesIO object to store the audio data
96
- audio_data = io.BytesIO()
97
-
98
- # Generate and stream audio
99
- for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
- buffer = output["audio"]
101
- audio_data.write(buffer)
102
-
103
- # Set the cursor position to the beginning of the BytesIO object
104
- audio_data.seek(0)
105
-
106
- # Create an Audio object from the BytesIO data
107
- audio = Audio(audio_data, rate=output["sampling_rate"])
108
-
109
- # Display the Audio object
110
- display(audio)
111
- ```
112
-
113
- To avoid storing your API key in the source code, we recommend doing one of the following:
114
- 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
115
- 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -1,7 +0,0 @@
1
- cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
- cartesia/tts.py,sha256=9m9_kqscMY0yzUU0Ty5k2HoeMqfrIbHouaS-ymcr64s,14127
3
- cartesia/version.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
4
- cartesia-0.0.4.dist-info/METADATA,sha256=tLUrKLREJiXrW-pfd3k61i9CnElKHk5RAyidCMxpR-s,3752
5
- cartesia-0.0.4.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
- cartesia-0.0.4.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
- cartesia-0.0.4.dist-info/RECORD,,