cartesia 0.0.3__py2.py3-none-any.whl → 0.0.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from cartesia.tts import CartesiaTTS
1
+ from cartesia.tts import AsyncCartesiaTTS, CartesiaTTS
2
2
 
3
- __all__ = ["CartesiaTTS"]
3
+ __all__ = ["CartesiaTTS", "AsyncCartesiaTTS"]
cartesia/tts.py CHANGED
@@ -1,16 +1,29 @@
1
+ import asyncio
1
2
  import base64
2
3
  import json
3
4
  import os
4
5
  import uuid
5
- from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
+ from types import TracebackType
7
+ from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
6
8
 
9
+ import aiohttp
10
+ import httpx
11
+ import logging
7
12
  import requests
8
13
  from websockets.sync.client import connect
9
14
 
15
+ from cartesia.utils import retry_on_connection_error, retry_on_connection_error_async
16
+
10
17
  DEFAULT_MODEL_ID = "genial-planet-1346"
11
18
  DEFAULT_BASE_URL = "api.cartesia.ai"
12
19
  DEFAULT_API_VERSION = "v0"
20
+ DEFAULT_TIMEOUT = 30 # seconds
21
+ DEFAULT_NUM_CONNECTIONS = 10 # connections per client
22
+
23
+ BACKOFF_FACTOR = 1
24
+ MAX_RETRIES = 3
13
25
 
26
+ logger = logging.getLogger(__name__)
14
27
 
15
28
  class AudioOutput(TypedDict):
16
29
  audio: bytes
@@ -27,14 +40,48 @@ class VoiceMetadata(TypedDict):
27
40
  embedding: Optional[Embedding]
28
41
 
29
42
 
43
def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
    """Accumulate streamed bytes and extract complete JSON audio payloads.

    Args:
        buffer: Text carried over from previous chunks (possibly a partial
            JSON object).
        chunk_bytes: The newly received UTF-8 bytes to append.

    Returns:
        A tuple of (remaining buffer text, list of decoded outputs). Each
        output dict has "audio" (decoded bytes) and "sampling_rate".
    """
    buffer += chunk_bytes.decode("utf-8")
    outputs = []
    while "{" in buffer and "}" in buffer:
        start_index = buffer.find("{")
        end_index = buffer.find("}", start_index)
        if end_index == -1:
            # Fix: the only "}" precedes the "{" (malformed/partial data).
            # The old code skipped without consuming, looping forever.
            break
        try:
            chunk_json = json.loads(buffer[start_index : end_index + 1])
            audio = base64.b64decode(chunk_json["data"])
            outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
            buffer = buffer[end_index + 1 :]
        except json.JSONDecodeError:
            # Partial object; wait for more bytes.
            break
    return buffer, outputs
58
+
59
+
60
def convert_response(response: Dict[str, Any], include_context_id: bool) -> Dict[str, Any]:
    """Convert a parsed JSON audio message into an output dict.

    Fix: the annotation previously used the builtin ``any`` instead of
    ``typing.Any``.

    Args:
        response: Parsed JSON message containing base64-encoded "data",
            "sampling_rate", and (when requested) "context_id".
        include_context_id: Whether to copy "context_id" into the result.

    Returns:
        Dict with "audio" (decoded bytes), "sampling_rate", and optionally
        "context_id".
    """
    audio = base64.b64decode(response["data"])

    optional_kwargs = {}
    if include_context_id:
        optional_kwargs["context_id"] = response["context_id"]

    return {
        "audio": audio,
        "sampling_rate": response["sampling_rate"],
        **optional_kwargs,
    }
72
+
73
+
30
74
  class CartesiaTTS:
31
75
  """The client for Cartesia's text-to-speech library.
32
76
 
33
77
  This client contains methods to interact with the Cartesia text-to-speech API.
34
- The API offers
78
+ The client can be used to retrieve available voices, compute new voice embeddings,
79
+ and generate speech from text.
35
80
 
36
- Examples:
81
+ The client also supports generating audio using a websocket for lower latency.
82
+ To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
37
83
 
84
+ Examples:
38
85
  >>> client = CartesiaTTS()
39
86
 
40
87
  # Load available voices and their metadata (excluding the embeddings).
@@ -55,19 +102,21 @@ class CartesiaTTS:
55
102
  ... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
56
103
  """
57
104
 
58
- def __init__(self, *, api_key: str = None):
59
- """
60
- Args:
61
- api_key: The API key to use for authorization.
62
- If not specified, the API key will be read from the environment variable
63
- `CARTESIA_API_KEY`.
105
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
106
+ """Args:
107
+ api_key: The API key to use for authorization.
108
+ If not specified, the API key will be read from the environment variable
109
+ `CARTESIA_API_KEY`.
110
+ experimental_ws_handle_interrupts: Whether to handle interrupts when generating
111
+ audio using the websocket. This is an experimental feature and may have bugs
112
+ or be deprecated in the future.
64
113
  """
65
114
  self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
66
115
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
67
116
  self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
68
117
  self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
69
118
  self.websocket = None
70
- self.refresh_websocket()
119
+ self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
71
120
 
72
121
  def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
73
122
  """Returns a mapping from voice name -> voice metadata.
@@ -100,9 +149,14 @@ class CartesiaTTS:
100
149
  >>> audio = client.generate(transcript="Hello world!", voice=embedding)
101
150
  """
102
151
  params = {"select": "id, name, description"} if skip_embeddings else None
103
- response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
152
+ response = httpx.get(
153
+ f"{self._http_url()}/voices",
154
+ headers=self.headers,
155
+ params=params,
156
+ timeout=DEFAULT_TIMEOUT,
157
+ )
104
158
 
105
- if response.status_code != 200:
159
+ if not response.is_success:
106
160
  raise ValueError(f"Failed to get voices. Error: {response.text}")
107
161
 
108
162
  voices = response.json()
@@ -112,6 +166,7 @@ class CartesiaTTS:
112
166
  voice["embedding"] = json.loads(voice["embedding"])
113
167
  return {voice["name"]: voice for voice in voices}
114
168
 
169
+ @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
115
170
  def get_voice_embedding(
116
171
  self, *, voice_id: str = None, filepath: str = None, link: str = None
117
172
  ) -> Embedding:
@@ -134,20 +189,20 @@ class CartesiaTTS:
134
189
 
135
190
  if voice_id:
136
191
  url = f"{self._http_url()}/voices/embedding/{voice_id}"
137
- response = requests.get(url, headers=self.headers)
192
+ response = httpx.get(url, headers=self.headers, timeout=DEFAULT_TIMEOUT)
138
193
  elif filepath:
139
194
  url = f"{self._http_url()}/voices/clone/clip"
140
195
  files = {"clip": open(filepath, "rb")}
141
196
  headers = self.headers.copy()
142
197
  # The default content type of JSON is incorrect for file uploads
143
198
  headers.pop("Content-Type")
144
- response = requests.post(url, headers=headers, files=files)
199
+ response = httpx.post(url, headers=headers, files=files, timeout=DEFAULT_TIMEOUT)
145
200
  elif link:
146
201
  url = f"{self._http_url()}/voices/clone/url"
147
202
  params = {"link": link}
148
- response = requests.post(url, headers=self.headers, params=params)
203
+ response = httpx.post(url, headers=self.headers, params=params, timeout=DEFAULT_TIMEOUT)
149
204
 
150
- if response.status_code != 200:
205
+ if not response.is_success:
151
206
  raise ValueError(
152
207
  f"Failed to clone voice. Status Code: {response.status_code}\n"
153
208
  f"Error: {response.text}"
@@ -165,12 +220,11 @@ class CartesiaTTS:
165
220
  Note:
166
221
  The connection is synchronous.
167
222
  """
168
- if self.websocket and not self._is_websocket_closed():
169
- self.websocket.close()
170
- self.websocket = connect(
171
- f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
172
- close_timeout=None,
173
- )
223
+ if self.websocket is None or self._is_websocket_closed():
224
+ route = "audio/websocket"
225
+ if self.experimental_ws_handle_interrupts:
226
+ route = f"experimental/{route}"
227
+ self.websocket = connect(f"{self._ws_url()}/{route}?api_key={self.api_key}")
174
228
 
175
229
  def _is_websocket_closed(self):
176
230
  return self.websocket.socket.fileno() == -1
@@ -189,29 +243,53 @@ class CartesiaTTS:
189
243
  if transcript.strip() == "":
190
244
  raise ValueError("`transcript` must be non empty")
191
245
 
246
+ def _generate_request_body(
247
+ self,
248
+ *,
249
+ transcript: str,
250
+ voice: Embedding,
251
+ model_id: str,
252
+ duration: int = None,
253
+ chunk_time: float = None,
254
+ ) -> Dict[str, Any]:
255
+ """Create the request body for a stream request.
256
+
257
+ Note that anything that's not provided will use a default if available or be
258
+ filtered out otherwise.
259
+ """
260
+ body = dict(transcript=transcript, model_id=model_id, voice=voice)
261
+
262
+ optional_body = dict(
263
+ duration=duration,
264
+ chunk_time=chunk_time,
265
+ )
266
+ body.update({k: v for k, v in optional_body.items() if v is not None})
267
+
268
+ return body
269
+
192
270
  def generate(
193
271
  self,
194
272
  *,
195
273
  transcript: str,
274
+ voice: Embedding,
275
+ model_id: str = DEFAULT_MODEL_ID,
196
276
  duration: int = None,
197
277
  chunk_time: float = None,
198
- voice: Embedding = None,
199
278
  stream: bool = False,
200
279
  websocket: bool = True,
201
280
  ) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
202
281
  """Generate audio from a transcript.
203
282
 
204
283
  Args:
205
- transcript: The text to generate audio for.
206
- duration: The maximum duration of the audio in seconds.
207
- chunk_time: How long each audio segment should be in seconds.
284
+ transcript (str): The text to generate audio for.
285
+ voice (Embedding (List[float])): The voice to use for generating audio.
286
+ duration (int, optional): The maximum duration of the audio in seconds.
287
+ chunk_time (float, optional): How long each audio segment should be in seconds.
208
288
  This should not need to be adjusted.
209
- voice: The voice to use for generating audio.
210
- This can either be a voice id (string) or an embedding vector (List[float]).
211
- stream: Whether to stream the audio or not.
212
- If ``True`` this function returns a generator.
213
- websocket: Whether to use a websocket for streaming audio.
214
- Using the websocket reduces latency by pre-poning the handshake.
289
+ stream (bool, optional): Whether to stream the audio or not.
290
+ If True this function returns a generator. False by default.
291
+ websocket (bool, optional): Whether to use a websocket for streaming audio.
292
+ Using the websocket reduces latency by pre-poning the handshake. True by default.
215
293
 
216
294
  Returns:
217
295
  A generator if `stream` is True, otherwise a dictionary.
@@ -221,19 +299,18 @@ class CartesiaTTS:
221
299
  """
222
300
  self._check_inputs(transcript, duration, chunk_time)
223
301
 
224
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
225
-
226
- optional_body = dict(
227
- duration=duration,
228
- chunk_time=chunk_time,
229
- voice=voice,
302
+ body = self._generate_request_body(
303
+ transcript=transcript,
304
+ voice=voice,
305
+ model_id=model_id,
306
+ duration=duration,
307
+ chunk_time=chunk_time
230
308
  )
231
- body.update({k: v for k, v in optional_body.items() if v is not None})
232
309
 
233
310
  if websocket:
234
311
  generator = self._generate_ws(body)
235
312
  else:
236
- generator = self._generate_http(body)
313
+ generator = self._generate_http_wrapper(body)
237
314
 
238
315
  if stream:
239
316
  return generator
@@ -247,30 +324,32 @@ class CartesiaTTS:
247
324
 
248
325
  return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
249
326
 
327
+ @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
328
+ def _generate_http_wrapper(self, body: Dict[str, Any]):
329
+ """Need to wrap the http generator in a function for the retry decorator to work."""
330
+ try:
331
+ for chunk in self._generate_http(body):
332
+ yield chunk
333
+ except Exception as e:
334
+ logger.error(f"Failed to generate audio. {e}")
335
+ raise e
336
+
250
337
  def _generate_http(self, body: Dict[str, Any]):
251
338
  response = requests.post(
252
339
  f"{self._http_url()}/audio/stream",
253
340
  stream=True,
254
341
  data=json.dumps(body),
255
342
  headers=self.headers,
343
+ timeout=(DEFAULT_TIMEOUT, DEFAULT_TIMEOUT),
256
344
  )
257
- if response.status_code != 200:
345
+ if not response.ok:
258
346
  raise ValueError(f"Failed to generate audio. {response.text}")
259
347
 
260
348
  buffer = ""
261
349
  for chunk_bytes in response.iter_content(chunk_size=None):
262
- buffer += chunk_bytes.decode("utf-8")
263
- while "{" in buffer and "}" in buffer:
264
- start_index = buffer.find("{")
265
- end_index = buffer.find("}", start_index)
266
- if start_index != -1 and end_index != -1:
267
- try:
268
- chunk_json = json.loads(buffer[start_index : end_index + 1])
269
- audio = base64.b64decode(chunk_json["data"])
270
- yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
271
- buffer = buffer[end_index + 1 :]
272
- except json.JSONDecodeError:
273
- break
350
+ buffer, outputs = update_buffer(buffer, chunk_bytes)
351
+ for output in outputs:
352
+ yield output
274
353
 
275
354
  if buffer:
276
355
  try:
@@ -280,21 +359,77 @@ class CartesiaTTS:
280
359
  except json.JSONDecodeError:
281
360
  pass
282
361
 
283
- def _generate_ws(self, body: Dict[str, Any]):
362
+ def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
363
+ """Generate audio using the websocket connection.
364
+
365
+ Args:
366
+ body: The request body.
367
+ context_id: The context id for the request.
368
+ The context id must be globally unique for the duration this client exists.
369
+ If this is provided, the context id that is in the response will
370
+ also be returned as part of the dict. This is helpful for testing.
371
+ """
284
372
  if not self.websocket or self._is_websocket_closed():
285
373
  self.refresh_websocket()
286
374
 
287
- self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
375
+ include_context_id = bool(context_id)
376
+ if context_id is None:
377
+ context_id = uuid.uuid4().hex
378
+ self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
288
379
  try:
289
- response = json.loads(self.websocket.recv())
290
- while not response["done"]:
291
- audio = base64.b64decode(response["data"])
292
- # print("timing", time.perf_counter() - start)
293
- yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
294
-
380
+ while True:
295
381
  response = json.loads(self.websocket.recv())
296
- except Exception:
297
- raise RuntimeError(f"Failed to generate audio. {response}")
382
+ if response["done"]:
383
+ break
384
+
385
+ yield convert_response(response, include_context_id)
386
+
387
+ if self.experimental_ws_handle_interrupts:
388
+ self.websocket.send(json.dumps({"context_id": context_id}))
389
+ except GeneratorExit:
390
+ # The exit is only called when the generator is garbage collected.
391
+ # It may not be called directly after a break statement.
392
+ # However, the generator will be automatically cancelled on the next request.
393
+ if self.experimental_ws_handle_interrupts:
394
+ self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
395
+ except Exception as e:
396
+ # Close the websocket connection if an error occurs.
397
+ if self.websocket and not self._is_websocket_closed():
398
+ self.websocket.close()
399
+ raise RuntimeError(f"Failed to generate audio. {response}") from e
400
+ finally:
401
+ # Ensure the websocket is ultimately closed.
402
+ if self.websocket and not self._is_websocket_closed():
403
+ self.websocket.close()
404
+
405
+ @retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
406
+ def transcribe(self, raw_audio: Union[bytes, str]) -> str:
407
+ raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
408
+ response = httpx.post(
409
+ f"{self._http_url()}/audio/transcriptions",
410
+ headers=headers,
411
+ files={"clip": ("input.wav", raw_audio_bytes)},
412
+ timeout=DEFAULT_TIMEOUT,
413
+ )
414
+
415
+ if not response.is_success:
416
+ raise ValueError(f"Failed to transcribe audio. Error: {response.text()}")
417
+
418
+ transcript = response.json()
419
+ return transcript["text"]
420
+
421
+
422
+ def prepare_audio_and_headers(
423
+ self, raw_audio: Union[bytes, str]
424
+ ) -> Tuple[bytes, Dict[str, Any]]:
425
+ if isinstance(raw_audio, str):
426
+ with open(raw_audio, "rb") as f:
427
+ raw_audio_bytes = f.read()
428
+ else:
429
+ raw_audio_bytes = raw_audio
430
+ # application/json is not the right content type for this request
431
+ headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
432
+ return raw_audio_bytes, headers
298
433
 
299
434
  def _http_url(self):
300
435
  prefix = "http" if "localhost" in self.base_url else "https"
@@ -304,6 +439,235 @@ class CartesiaTTS:
304
439
  prefix = "ws" if "localhost" in self.base_url else "wss"
305
440
  return f"{prefix}://{self.base_url}/{self.api_version}"
306
441
 
307
- def __del__(self):
308
- if self.websocket.socket.fileno() > -1:
442
+ def close(self):
443
+ if self.websocket and not self._is_websocket_closed():
309
444
  self.websocket.close()
445
+
446
+ def __del__(self):
447
+ self.close()
448
+
449
+ def __enter__(self):
450
+ self.refresh_websocket()
451
+ return self
452
+
453
+ def __exit__(
454
+ self,
455
+ exc_type: Union[type, None],
456
+ exc: Union[BaseException, None],
457
+ exc_tb: Union[TracebackType, None],
458
+ ):
459
+ self.close()
460
+
461
+
462
+ class AsyncCartesiaTTS(CartesiaTTS):
463
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
464
+ self._session = None
465
+ self._loop = None
466
+ super().__init__(
467
+ api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
468
+ )
469
+
470
+ async def _get_session(self):
471
+ current_loop = asyncio.get_event_loop()
472
+ if self._loop is not current_loop:
473
+ # If the loop has changed, close the session and create a new one.
474
+ await self.close()
475
+ if self._session is None or self._session.closed:
476
+ timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
477
+ connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
478
+ self._session = aiohttp.ClientSession(
479
+ timeout=timeout, connector=connector
480
+ )
481
+ self._loop = current_loop
482
+ return self._session
483
+
484
+ async def refresh_websocket(self):
485
+ """Refresh the websocket connection."""
486
+ if self.websocket is None or self._is_websocket_closed():
487
+ route = "audio/websocket"
488
+ if self.experimental_ws_handle_interrupts:
489
+ route = f"experimental/{route}"
490
+ session = await self._get_session()
491
+ self.websocket = await session.ws_connect(
492
+ f"{self._ws_url()}/{route}?api_key={self.api_key}"
493
+ )
494
+
495
+ def _is_websocket_closed(self):
496
+ return self.websocket.closed
497
+
498
+ async def close(self):
499
+ """This method closes the websocket and the session.
500
+
501
+ It is *strongly* recommended to call this method when you are done using the client.
502
+ """
503
+ if self.websocket is not None and not self._is_websocket_closed():
504
+ await self.websocket.close()
505
+ if self._session is not None and not self._session.closed:
506
+ await self._session.close()
507
+
508
+ async def generate(
509
+ self,
510
+ *,
511
+ transcript: str,
512
+ voice: Embedding,
513
+ model_id: str = DEFAULT_MODEL_ID,
514
+ duration: int = None,
515
+ chunk_time: float = None,
516
+ stream: bool = False,
517
+ websocket: bool = True,
518
+ ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
519
+ """Asynchronously generate audio from a transcript.
520
+ NOTE: This overrides the non-asynchronous generate method from the base class.
521
+
522
+ Args:
523
+ transcript (str): The text to generate audio for.
524
+ voice (Embedding (List[float])): The voice to use for generating audio.
525
+ duration (int, optional): The maximum duration of the audio in seconds.
526
+ chunk_time (float, optional): How long each audio segment should be in seconds.
527
+ This should not need to be adjusted.
528
+ stream (bool, optional): Whether to stream the audio or not.
529
+ If True this function returns a generator. False by default.
530
+ websocket (bool, optional): Whether to use a websocket for streaming audio.
531
+ Using the websocket reduces latency by pre-poning the handshake. True by default.
532
+
533
+ Returns:
534
+ A generator if `stream` is True, otherwise a dictionary.
535
+ Dictionary from both generator and non-generator return types have the following keys:
536
+ * "audio": The audio as a bytes buffer.
537
+ * "sampling_rate": The sampling rate of the audio.
538
+ """
539
+ self._check_inputs(transcript, duration, chunk_time)
540
+
541
+ body = self._generate_request_body(
542
+ transcript=transcript,
543
+ voice=voice,
544
+ model_id=model_id,
545
+ duration=duration,
546
+ chunk_time=chunk_time
547
+ )
548
+
549
+ if websocket:
550
+ generator = self._generate_ws(body)
551
+ else:
552
+ generator = self._generate_http_wrapper(body)
553
+
554
+ if stream:
555
+ return generator
556
+
557
+ chunks = []
558
+ sampling_rate = None
559
+ async for chunk in generator:
560
+ if sampling_rate is None:
561
+ sampling_rate = chunk["sampling_rate"]
562
+ chunks.append(chunk["audio"])
563
+
564
+ return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
565
+
566
+ @retry_on_connection_error_async(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
567
+ async def _generate_http_wrapper(self, body: Dict[str, Any]):
568
+ """Need to wrap the http generator in a function for the retry decorator to work."""
569
+ try:
570
+ async for chunk in self._generate_http(body):
571
+ yield chunk
572
+ except Exception as e:
573
+ logger.error(f"Failed to generate audio. {e}")
574
+ raise e
575
+
576
+ async def _generate_http(self, body: Dict[str, Any]):
577
+ session = await self._get_session()
578
+ async with session.post(
579
+ f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
580
+ ) as response:
581
+ if not response.ok:
582
+ raise ValueError(f"Failed to generate audio. {await response.text()}")
583
+
584
+ buffer = ""
585
+ async for chunk_bytes in response.content.iter_any():
586
+ buffer, outputs = update_buffer(buffer, chunk_bytes)
587
+ for output in outputs:
588
+ yield output
589
+
590
+ if buffer:
591
+ try:
592
+ chunk_json = json.loads(buffer)
593
+ audio = base64.b64decode(chunk_json["data"])
594
+ yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
595
+ except json.JSONDecodeError:
596
+ pass
597
+
598
+ async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
599
+ include_context_id = bool(context_id)
600
+ route = "audio/websocket"
601
+ if self.experimental_ws_handle_interrupts:
602
+ route = f"experimental/{route}"
603
+
604
+ if not self.websocket or self._is_websocket_closed():
605
+ await self.refresh_websocket()
606
+
607
+ ws = self.websocket
608
+ if context_id is None:
609
+ context_id = uuid.uuid4().hex
610
+ await ws.send_json({"data": body, "context_id": context_id})
611
+ try:
612
+ response = None
613
+ while True:
614
+ response = await ws.receive_json()
615
+ if response["done"]:
616
+ break
617
+
618
+ yield convert_response(response, include_context_id)
619
+
620
+ if self.experimental_ws_handle_interrupts:
621
+ await ws.send_json({"context_id": context_id})
622
+ except GeneratorExit:
623
+ # The exit is only called when the generator is garbage collected.
624
+ # It may not be called directly after a break statement.
625
+ # However, the generator will be automatically cancelled on the next request.
626
+ if self.experimental_ws_handle_interrupts:
627
+ await ws.send_json({"context_id": context_id, "action": "cancel"})
628
+ except Exception as e:
629
+ if self.websocket and not self._is_websocket_closed():
630
+ await self.websocket.close()
631
+ raise RuntimeError(f"Failed to generate audio. {await response.text()}") from e
632
+ finally:
633
+ # Ensure the websocket is ultimately closed.
634
+ if self.websocket and not self._is_websocket_closed():
635
+ await self.websocket.close()
636
+
637
+ async def transcribe(self, raw_audio: Union[bytes, str]) -> str:
638
+ raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
639
+ data = aiohttp.FormData()
640
+ data.add_field("clip", raw_audio_bytes, filename="input.wav", content_type="audio/wav")
641
+ session = await self._get_session()
642
+
643
+ async with session.post(
644
+ f"{self._http_url()}/audio/transcriptions", headers=headers, data=data
645
+ ) as response:
646
+ if not response.ok:
647
+ raise ValueError(f"Failed to transcribe audio. Error: {await response.text()}")
648
+
649
+ transcript = await response.json()
650
+ return transcript["text"]
651
+
652
+ def __del__(self):
653
+ try:
654
+ loop = asyncio.get_running_loop()
655
+ except RuntimeError:
656
+ loop = None
657
+
658
+ if loop is None:
659
+ asyncio.run(self.close())
660
+ else:
661
+ loop.create_task(self.close())
662
+
663
+ async def __aenter__(self):
664
+ await self.refresh_websocket()
665
+ return self
666
+
667
+ async def __aexit__(
668
+ self,
669
+ exc_type: Union[type, None],
670
+ exc: Union[BaseException, None],
671
+ exc_tb: Union[TracebackType, None],
672
+ ):
673
+ await self.close()
cartesia/utils.py ADDED
@@ -0,0 +1,65 @@
1
+ import time
2
+
3
+ from aiohttp.client_exceptions import ServerDisconnectedError
4
+ import asyncio
5
+ from functools import wraps
6
+ from http.client import RemoteDisconnected
7
+ from httpx import TimeoutException
8
+ from requests.exceptions import ConnectionError
9
+
10
def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
    """Retry a function on transient connection failures.

    Retries on ConnectionError, RemoteDisconnected, ServerDisconnectedError,
    or TimeoutException, sleeping with exponential backoff between attempts.

    Args:
        max_retries (int): The maximum number of attempts.
        backoff_factor (int): The factor to increase the delay between retries.
        logger (logging.Logger, optional): Logger for retry messages.
            Fix: the old code called logger.info/warn unconditionally and
            crashed with AttributeError when logger was None (the default).

    NOTE(review): applying this to a *generator* function will not retry
    exceptions raised during iteration, because calling the function only
    creates the generator — confirm call sites.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retry_count = 0
            while retry_count < max_retries:
                try:
                    return func(*args, **kwargs)
                except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
                    if logger is not None:
                        logger.info(f"Retrying after exception: {e}")
                    retry_count += 1
                    if retry_count >= max_retries:
                        raise Exception(f"Exception occurred after {max_retries} tries.") from e
                    delay = backoff_factor * (2 ** (retry_count - 1))
                    if logger is not None:
                        # logger.warning: .warn is deprecated since Python 3.3.
                        logger.warning(f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds...")
                    time.sleep(delay)
        return wrapper
    return decorator
36
+
37
def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None):
    """Retry an async generator function on transient connection failures.

    Retries on ConnectionError, RemoteDisconnected, ServerDisconnectedError,
    or TimeoutException, sleeping with exponential backoff between attempts.

    Args:
        max_retries (int): The maximum number of attempts.
        backoff_factor (int): The factor to increase the delay between retries.
        logger (logging.Logger, optional): Logger for retry messages.
            Fix: the old code called logger.info/warn unconditionally and
            crashed with AttributeError when logger was None (the default).

    NOTE(review): if the wrapped generator fails after yielding some chunks,
    a retry restarts it from the beginning, so already-yielded chunks are
    emitted again — confirm call sites tolerate this.
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            retry_count = 0
            while retry_count < max_retries:
                try:
                    async for chunk in func(*args, **kwargs):
                        yield chunk
                    # If the function completes without raising an exception return
                    return
                except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
                    if logger is not None:
                        logger.info(f"Retrying after exception: {e}")
                    retry_count += 1
                    if retry_count >= max_retries:
                        raise Exception(f"Exception occurred after {max_retries} tries.") from e
                    delay = backoff_factor * (2 ** (retry_count - 1))
                    if logger is not None:
                        # logger.warning: .warn is deprecated since Python 3.3.
                        logger.warning(f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds...")
                    await asyncio.sleep(delay)
        return wrapper
    return decorator
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.3"
1
+ __version__ = "0.0.5"
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.1
2
+ Name: cartesia
3
+ Version: 0.0.5
4
+ Summary: The official Python library for the Cartesia API.
5
+ Home-page:
6
+ Author: Cartesia, Inc.
7
+ Author-email: support@cartesia.ai
8
+ Classifier: Programming Language :: Python
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Requires-Python: >=3.8.0
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: aiohttp
14
+ Requires-Dist: httpx
15
+ Requires-Dist: pytest-asyncio
16
+ Requires-Dist: requests
17
+ Requires-Dist: websockets
18
+ Provides-Extra: all
19
+ Requires-Dist: pytest >=8.0.2 ; extra == 'all'
20
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
21
+ Requires-Dist: twine ; extra == 'all'
22
+ Requires-Dist: setuptools ; extra == 'all'
23
+ Requires-Dist: wheel ; extra == 'all'
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
26
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
27
+ Requires-Dist: twine ; extra == 'dev'
28
+ Requires-Dist: setuptools ; extra == 'dev'
29
+ Requires-Dist: wheel ; extra == 'dev'
30
+
31
+
32
+ # Cartesia Python API Library
33
+ The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
34
+
35
+ **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
36
+
37
+ ## Installation
38
+ ```bash
39
+ pip install cartesia
40
+
41
+ # pip install in editable mode w/ dev dependencies
42
+ pip install -e '.[dev]'
43
+ ```
44
+
45
+ ## Usage
46
+ ```python
47
+ from cartesia.tts import CartesiaTTS
48
+ import pyaudio
49
+ import os
50
+
51
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
52
+ voices = client.get_voices()
53
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
54
+ transcript = "Hello! Welcome to Cartesia"
55
+ model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
56
+
57
+ p = pyaudio.PyAudio()
58
+
59
+ stream = None
60
+
61
+ # Generate and stream audio
62
+ for output in client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
63
+ buffer = output["audio"]
64
+ rate = output["sampling_rate"]
65
+
66
+ if not stream:
67
+ stream = p.open(format=pyaudio.paFloat32,
68
+ channels=1,
69
+ rate=rate,
70
+ output=True)
71
+
72
+ # Write the audio data to the stream
73
+ stream.write(buffer)
74
+
75
+ stream.stop_stream()
76
+ stream.close()
77
+ p.terminate()
78
+ ```
79
+
80
+ You can also use the async client if you want to make asynchronous API calls:
81
+ ```python
82
+ from cartesia.tts import AsyncCartesiaTTS
83
+ import asyncio
84
+ import pyaudio
85
+ import os
86
+
87
+ async def write_stream():
88
+ client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
89
+ voices = client.get_voices()
90
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
91
+ transcript = "Hello! Welcome to Cartesia"
92
+ model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
93
+
94
+ p = pyaudio.PyAudio()
95
+
96
+ stream = None
97
+
98
+ # Generate and stream audio
99
+ async for output in await client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
100
+ buffer = output["audio"]
101
+ rate = output["sampling_rate"]
102
+
103
+ if not stream:
104
+ stream = p.open(format=pyaudio.paFloat32,
105
+ channels=1,
106
+ rate=rate,
107
+ output=True)
108
+
109
+ # Write the audio data to the stream
110
+ stream.write(buffer)
111
+
112
+ stream.stop_stream()
113
+ stream.close()
114
+ p.terminate()
115
+
116
+ asyncio.run(write_stream())
117
+ ```
118
+
119
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
120
+ Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
121
+
122
+ ```python
123
+ from IPython.display import Audio
124
+ import io
125
+ import os
126
+ import numpy as np
127
+
128
+ from cartesia.tts import CartesiaTTS
129
+
130
+ with CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
131
+ voices = client.get_voices()
132
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
133
+ transcript = "Hello! Welcome to Cartesia"
134
+
135
+ # Create a BytesIO object to store the audio data
136
+ audio_data = io.BytesIO()
137
+
138
+ # Generate and stream audio
139
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
140
+ buffer = output["audio"]
141
+ audio_data.write(buffer)
142
+
143
+ # Set the cursor position to the beginning of the BytesIO object
144
+ audio_data.seek(0)
145
+
146
+ # Create an Audio object from the BytesIO data
147
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
148
+
149
+ # Display the Audio object
150
+ display(audio)
151
+ ```
152
+
153
+ Below is the same example using the async client:
154
+ ```python
155
+ from IPython.display import Audio
156
+ import io
157
+ import os
158
+ import numpy as np
159
+
160
+ from cartesia.tts import AsyncCartesiaTTS
161
+
162
+ async with AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
163
+ voices = client.get_voices()
164
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
165
+ transcript = "Hello! Welcome to Cartesia"
166
+
167
+ # Create a BytesIO object to store the audio data
168
+ audio_data = io.BytesIO()
169
+
170
+ # Generate and stream audio
171
+ async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
172
+ buffer = output["audio"]
173
+ audio_data.write(buffer)
174
+
175
+ # Set the cursor position to the beginning of the BytesIO object
176
+ audio_data.seek(0)
177
+
178
+ # Create an Audio object from the BytesIO data
179
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
180
+
181
+ # Display the Audio object
182
+ display(audio)
183
+ ```
184
+
185
+ To avoid storing your API key in the source code, we recommend doing one of the following:
186
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
187
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably to a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`)
@@ -0,0 +1,8 @@
1
+ cartesia/__init__.py,sha256=uIc9xGNPs8_A6eAvbTUY1geazunYoEZVWFKhCwC9TRA,102
2
+ cartesia/tts.py,sha256=Gtm9qse83g3SX4-KbmlxOAvTQcZIjmUkMfBKu2Xf9rY,26449
3
+ cartesia/utils.py,sha256=GoTJe8LZ3WpS4hXkwoZauPYjo7Mbx7BvbBjAX5vEbwg,3024
4
+ cartesia/version.py,sha256=S7u1lbuWmM3A3ajykBialmPoJUK6Jg-WmNqM-9OZFdk,22
5
+ cartesia-0.0.5.dist-info/METADATA,sha256=oK64bcTyLhrosXh9FjuEwB2SUdQzbYsxzOWCnf6qaI4,5974
6
+ cartesia-0.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
7
+ cartesia-0.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
8
+ cartesia-0.0.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.2)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py2-none-any
5
5
  Tag: py3-none-any
@@ -1,113 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: cartesia
3
- Version: 0.0.3
4
- Summary: The official Python library for the Cartesia API.
5
- Home-page:
6
- Author: Cartesia, Inc.
7
- Author-email: support@cartesia.ai
8
- Classifier: Programming Language :: Python
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
- Requires-Python: >=3.8.0
12
- Description-Content-Type: text/markdown
13
- Requires-Dist: websockets
14
- Requires-Dist: requests
15
- Provides-Extra: all
16
- Requires-Dist: pre-commit ; extra == 'all'
17
- Requires-Dist: docformatter ; extra == 'all'
18
- Requires-Dist: black ==24.1.1 ; extra == 'all'
19
- Requires-Dist: isort ==5.13.2 ; extra == 'all'
20
- Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
21
- Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
22
- Requires-Dist: pytest >=8.0.2 ; extra == 'all'
23
- Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
24
- Provides-Extra: dev
25
- Requires-Dist: pre-commit ; extra == 'dev'
26
- Requires-Dist: docformatter ; extra == 'dev'
27
- Requires-Dist: black ==24.1.1 ; extra == 'dev'
28
- Requires-Dist: isort ==5.13.2 ; extra == 'dev'
29
- Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
30
- Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
31
- Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
32
- Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
33
-
34
-
35
- # Cartesia Python API Library
36
- The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
37
-
38
- **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
39
-
40
- ## Installation
41
- ```bash
42
- pip install cartesia
43
-
44
- # pip install in editable mode w/ dev dependencies
45
- pip install -e '.[dev]'
46
- ```
47
-
48
- ## Usage
49
- ```python
50
- from cartesia.tts import CartesiaTTS
51
- import pyaudio
52
- import os
53
-
54
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
- voices = client.get_voices()
56
- voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
57
- transcript = "Hello! Welcome to Cartesia"
58
-
59
- p = pyaudio.PyAudio()
60
-
61
- stream = None
62
-
63
- # Generate and stream audio
64
- for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
- buffer = output["audio"]
66
- rate = output["sampling_rate"]
67
-
68
- if not stream:
69
- stream = p.open(format=pyaudio.paFloat32,
70
- channels=1,
71
- rate=rate,
72
- output=True)
73
-
74
- # Write the audio data to the stream
75
- stream.write(buffer)
76
-
77
- stream.stop_stream()
78
- stream.close()
79
- p.terminate()
80
- ```
81
-
82
- If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
83
-
84
- ```python
85
- from cartesia.tts import CartesiaTTS
86
- from IPython.display import Audio
87
- import io
88
- import os
89
-
90
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
- voices = client.get_voices()
92
- voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
- transcript = "Hello! Welcome to Cartesia"
94
-
95
- # Create a BytesIO object to store the audio data
96
- audio_data = io.BytesIO()
97
-
98
- # Generate and stream audio
99
- for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
- buffer = output["audio"]
101
- audio_data.write(buffer)
102
-
103
- # Set the cursor position to the beginning of the BytesIO object
104
- audio_data.seek(0)
105
-
106
- # Create an Audio object from the BytesIO data
107
- audio = Audio(audio_data, rate=output["sampling_rate"])
108
-
109
- # Display the Audio object
110
- display(audio)
111
- ```
112
-
113
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
@@ -1,7 +0,0 @@
1
- cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
- cartesia/tts.py,sha256=ABXW9rc8Pn0GTRvb_7DHZKMtbvhGUiqOgHmvztwlOnI,12033
3
- cartesia/version.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
4
- cartesia-0.0.3.dist-info/METADATA,sha256=VsCGL1sITbKqERihK2rzVm9WIY5EJ5nCS_CXQ0s14ns,3604
5
- cartesia-0.0.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
- cartesia-0.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
- cartesia-0.0.3.dist-info/RECORD,,