cartesia 0.0.3__py2.py3-none-any.whl → 0.0.5rc1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/tts.py CHANGED
@@ -1,15 +1,20 @@
1
+ import asyncio
1
2
  import base64
2
3
  import json
3
4
  import os
4
5
  import uuid
5
- from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
+ from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
6
7
 
8
+ import aiohttp
9
+ import httpx
7
10
  import requests
8
11
  from websockets.sync.client import connect
9
12
 
10
13
  DEFAULT_MODEL_ID = "genial-planet-1346"
11
14
  DEFAULT_BASE_URL = "api.cartesia.ai"
12
15
  DEFAULT_API_VERSION = "v0"
16
+ DEFAULT_TIMEOUT = 60 # seconds
17
+ DEFAULT_NUM_CONNECTIONS = 10 # connections per client
13
18
 
14
19
 
15
20
  class AudioOutput(TypedDict):
@@ -27,11 +32,46 @@ class VoiceMetadata(TypedDict):
27
32
  embedding: Optional[Embedding]
28
33
 
29
34
 
35
def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
    """Append a streamed chunk to ``buffer`` and extract every complete JSON object.

    The HTTP streaming endpoint delivers JSON objects back to back, and a single
    network chunk may contain a partial object, exactly one object, or several.
    This helper accumulates the text and peels off as many complete objects as
    are currently available.

    Args:
        buffer: Text left over from previous calls (``""`` on the first call).
        chunk_bytes: The newly received bytes, decoded as UTF-8.

    Returns:
        A ``(remaining_buffer, outputs)`` tuple. ``remaining_buffer`` holds the
        text that could not be parsed yet and must be passed to the next call.
        ``outputs`` is a list of ``{"audio": bytes, "sampling_rate": ...}``
        dicts, one per parsed object, in arrival order.

    Raises:
        KeyError: If a parsed object lacks the ``"data"`` or ``"sampling_rate"`` key.
    """
    buffer += chunk_bytes.decode("utf-8")
    decoder = json.JSONDecoder()
    outputs = []
    while True:
        start_index = buffer.find("{")
        if start_index == -1:
            # No object start in the buffer; wait for more data.
            break
        try:
            # raw_decode parses exactly one JSON value and reports where it ended.
            # Unlike searching for the first "}", it copes with nested objects and
            # braces inside strings, and it cannot spin forever on a stray "}".
            chunk_json, end_index = decoder.raw_decode(buffer, start_index)
        except json.JSONDecodeError:
            # Incomplete (or malformed) object; keep the text for the next chunk.
            break
        audio = base64.b64decode(chunk_json["data"])
        outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
        buffer = buffer[end_index:]
    return buffer, outputs
50
+
51
+
52
def convert_response(response: Dict[str, Any], include_context_id: bool) -> Dict[str, Any]:
    """Convert a raw websocket message into the audio dict returned to callers.

    Args:
        response: A decoded websocket message. Must contain the base64-encoded
            ``"data"`` and the ``"sampling_rate"`` keys, plus ``"context_id"``
            when ``include_context_id`` is true.
        include_context_id: Whether to copy ``response["context_id"]`` into the
            result.

    Returns:
        A dict with ``"audio"`` (the decoded bytes) and ``"sampling_rate"``,
        and ``"context_id"`` when requested.

    Raises:
        KeyError: If a required key is missing from ``response``.
    """
    audio = base64.b64decode(response["data"])

    optional_kwargs = {}
    if include_context_id:
        optional_kwargs["context_id"] = response["context_id"]

    return {
        "audio": audio,
        "sampling_rate": response["sampling_rate"],
        **optional_kwargs,
    }
64
+
65
+
30
66
  class CartesiaTTS:
31
67
  """The client for Cartesia's text-to-speech library.
32
68
 
33
69
  This client contains methods to interact with the Cartesia text-to-speech API.
34
- The API offers
70
+ The client can be used to retrieve available voices, compute new voice embeddings,
71
+ and generate speech from text.
72
+
73
+ The client also supports generating audio using a websocket for lower latency.
74
+ To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
35
75
 
36
76
  Examples:
37
77
 
@@ -55,18 +95,22 @@ class CartesiaTTS:
55
95
  ... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
56
96
  """
57
97
 
58
- def __init__(self, *, api_key: str = None):
98
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
59
99
  """
60
100
  Args:
61
101
  api_key: The API key to use for authorization.
62
102
  If not specified, the API key will be read from the environment variable
63
103
  `CARTESIA_API_KEY`.
104
+ experimental_ws_handle_interrupts: Whether to handle interrupts when generating
105
+ audio using the websocket. This is an experimental feature and may have bugs
106
+ or be deprecated in the future.
64
107
  """
65
108
  self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
66
109
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
67
110
  self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
68
111
  self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
69
112
  self.websocket = None
113
+ self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
70
114
  self.refresh_websocket()
71
115
 
72
116
  def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
@@ -100,9 +144,9 @@ class CartesiaTTS:
100
144
  >>> audio = client.generate(transcript="Hello world!", voice=embedding)
101
145
  """
102
146
  params = {"select": "id, name, description"} if skip_embeddings else None
103
- response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
147
+ response = httpx.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
104
148
 
105
- if response.status_code != 200:
149
+ if not response.is_success:
106
150
  raise ValueError(f"Failed to get voices. Error: {response.text}")
107
151
 
108
152
  voices = response.json()
@@ -134,20 +178,20 @@ class CartesiaTTS:
134
178
 
135
179
  if voice_id:
136
180
  url = f"{self._http_url()}/voices/embedding/{voice_id}"
137
- response = requests.get(url, headers=self.headers)
181
+ response = httpx.get(url, headers=self.headers)
138
182
  elif filepath:
139
183
  url = f"{self._http_url()}/voices/clone/clip"
140
184
  files = {"clip": open(filepath, "rb")}
141
185
  headers = self.headers.copy()
142
186
  # The default content type of JSON is incorrect for file uploads
143
187
  headers.pop("Content-Type")
144
- response = requests.post(url, headers=headers, files=files)
188
+ response = httpx.post(url, headers=headers, files=files)
145
189
  elif link:
146
190
  url = f"{self._http_url()}/voices/clone/url"
147
191
  params = {"link": link}
148
- response = requests.post(url, headers=self.headers, params=params)
192
+ response = httpx.post(url, headers=self.headers, params=params)
149
193
 
150
- if response.status_code != 200:
194
+ if not response.is_success:
151
195
  raise ValueError(
152
196
  f"Failed to clone voice. Status Code: {response.status_code}\n"
153
197
  f"Error: {response.text}"
@@ -167,8 +211,11 @@ class CartesiaTTS:
167
211
  """
168
212
  if self.websocket and not self._is_websocket_closed():
169
213
  self.websocket.close()
214
+ route = "audio/websocket"
215
+ if self.experimental_ws_handle_interrupts:
216
+ route = f"experimental/{route}"
170
217
  self.websocket = connect(
171
- f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
218
+ f"{self._ws_url()}/{route}?api_key={self.api_key}",
172
219
  close_timeout=None,
173
220
  )
174
221
 
@@ -189,6 +236,29 @@ class CartesiaTTS:
189
236
  if transcript.strip() == "":
190
237
  raise ValueError("`transcript` must be non empty")
191
238
 
239
def _generate_request_body(
    self,
    *,
    transcript: str,
    duration: Optional[int] = None,
    chunk_time: Optional[float] = None,
    voice: Optional[Embedding] = None,
) -> Dict[str, Any]:
    """Create the request body for a generation request.

    Args:
        transcript: The text to generate audio for.
        duration: Optional duration to forward to the API.
        chunk_time: Optional chunk time to forward to the API.
        voice: Optional voice embedding to forward to the API.

    Returns:
        A dict that always contains ``transcript`` and ``model_id`` (set to
        ``DEFAULT_MODEL_ID``). Each optional field is included only when it is
        not ``None``.
    """
    # Only the always-present fields go here. ``voice`` is optional, so it must
    # go through the None-filter below rather than being sent as an explicit null.
    body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)

    optional_body = dict(
        duration=duration,
        chunk_time=chunk_time,
        voice=voice,
    )
    body.update({k: v for k, v in optional_body.items() if v is not None})

    return body
261
+
192
262
  def generate(
193
263
  self,
194
264
  *,
@@ -221,14 +291,9 @@ class CartesiaTTS:
221
291
  """
222
292
  self._check_inputs(transcript, duration, chunk_time)
223
293
 
224
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
225
-
226
- optional_body = dict(
227
- duration=duration,
228
- chunk_time=chunk_time,
229
- voice=voice,
294
+ body = self._generate_request_body(
295
+ transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
230
296
  )
231
- body.update({k: v for k, v in optional_body.items() if v is not None})
232
297
 
233
298
  if websocket:
234
299
  generator = self._generate_ws(body)
@@ -254,23 +319,14 @@ class CartesiaTTS:
254
319
  data=json.dumps(body),
255
320
  headers=self.headers,
256
321
  )
257
- if response.status_code != 200:
322
+ if not response.ok:
258
323
  raise ValueError(f"Failed to generate audio. {response.text}")
259
324
 
260
325
  buffer = ""
261
326
  for chunk_bytes in response.iter_content(chunk_size=None):
262
- buffer += chunk_bytes.decode("utf-8")
263
- while "{" in buffer and "}" in buffer:
264
- start_index = buffer.find("{")
265
- end_index = buffer.find("}", start_index)
266
- if start_index != -1 and end_index != -1:
267
- try:
268
- chunk_json = json.loads(buffer[start_index : end_index + 1])
269
- audio = base64.b64decode(chunk_json["data"])
270
- yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
271
- buffer = buffer[end_index + 1 :]
272
- except json.JSONDecodeError:
273
- break
327
+ buffer, outputs = update_buffer(buffer, chunk_bytes)
328
+ for output in outputs:
329
+ yield output
274
330
 
275
331
  if buffer:
276
332
  try:
@@ -280,21 +336,41 @@ class CartesiaTTS:
280
336
  except json.JSONDecodeError:
281
337
  pass
282
338
 
283
- def _generate_ws(self, body: Dict[str, Any]):
339
def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
    """Generate audio using the websocket connection.

    Args:
        body: The request body.
        context_id: The context id for the request.
            The context id must be globally unique for the duration this client exists.
            If this is provided, the context id that is in the response will
            also be returned as part of the dict. This is helpful for testing.

    Yields:
        One dict per received message, as built by ``convert_response``, until a
        message with a truthy ``"done"`` field arrives.

    Raises:
        RuntimeError: If receiving or decoding a message fails. The original
            exception is attached as ``__cause__``.
    """
    if not self.websocket or self._is_websocket_closed():
        self.refresh_websocket()

    include_context_id = bool(context_id)
    if context_id is None:
        context_id = uuid.uuid4().hex
    self.websocket.send(json.dumps({"data": body, "context_id": context_id}))

    # Bind before the loop so the error handler can always format it, even when
    # the very first recv() fails and nothing has been received yet.
    response = None
    try:
        while True:
            response = json.loads(self.websocket.recv())
            if response["done"]:
                break

            yield convert_response(response, include_context_id)

            if self.experimental_ws_handle_interrupts:
                # Sent after each chunk the caller has consumed.
                self.websocket.send(json.dumps({"context_id": context_id}))
    except GeneratorExit:
        # The exit is only called when the generator is garbage collected.
        # It may not be called directly after a break statement.
        # However, the generator will be automatically cancelled on the next request.
        if self.experimental_ws_handle_interrupts:
            self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
    except Exception as e:
        raise RuntimeError(f"Failed to generate audio. {response}") from e
298
374
 
299
375
  def _http_url(self):
300
376
  prefix = "http" if "localhost" in self.base_url else "https"
@@ -307,3 +383,143 @@ class CartesiaTTS:
307
383
  def __del__(self):
308
384
  if self.websocket.socket.fileno() > -1:
309
385
  self.websocket.close()
386
+
387
+
388
class AsyncCartesiaTTS(CartesiaTTS):
    """Asynchronous variant of :class:`CartesiaTTS`.

    Audio generation (HTTP streaming and websocket) goes through a shared
    ``aiohttp.ClientSession``. Methods that are not overridden here are
    inherited unchanged from :class:`CartesiaTTS`.

    Call :meth:`cleanup` when finished to close the websocket and the session.
    """

    def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
        """
        Args:
            api_key: The API key to use for authorization, forwarded to the base class.
            experimental_ws_handle_interrupts: Whether to use the experimental
                websocket route that handles interrupts.
        """
        self.timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
        self.connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
        self._session = aiohttp.ClientSession(timeout=self.timeout, connector=self.connector)
        super().__init__(
            api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
        )

    def refresh_websocket(self):
        """No-op: the websocket is opened lazily by ``_async_refresh_websocket``."""
        pass  # do not load the websocket for the client until asynchronously when it is needed

    async def _async_refresh_websocket(self):
        """Refresh the websocket connection."""
        if self.websocket and not self._is_websocket_closed():
            # aiohttp's websocket close() is a coroutine; without the await the
            # old connection would never actually be closed.
            await self.websocket.close()
        route = "audio/websocket"
        if self.experimental_ws_handle_interrupts:
            route = f"experimental/{route}"
        self.websocket = await self._session.ws_connect(
            f"{self._ws_url()}/{route}?api_key={self.api_key}"
        )

    async def generate(
        self,
        *,
        transcript: str,
        duration: int = None,
        chunk_time: float = None,
        voice: Embedding = None,
        stream: bool = False,
        websocket: bool = True,
    ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
        """Asynchronously generate audio from a transcript.

        NOTE: This overrides the non-asynchronous generate method from the base class.

        Args:
            transcript: The text to generate audio for.
            duration: Optional duration forwarded in the request body.
            chunk_time: Optional chunk time forwarded in the request body.
            voice: The embedding to use for generating audio.
            stream: If True, return an async generator that yields audio chunks
                as they arrive. Because this method is a coroutine, iterate with
                ``async for chunk in await client.generate(..., stream=True)``.
            websocket: If True, generate over the websocket; otherwise use the
                HTTP streaming endpoint.

        Returns:
            If ``stream`` is False, a dictionary containing the following:
                * "audio": The audio as bytes (all chunks concatenated).
                * "sampling_rate": The sampling rate of the audio.
            If ``stream`` is True, an async generator yielding such dictionaries,
            one per chunk.
        """
        # Validate the inputs the same way the synchronous client does.
        self._check_inputs(transcript, duration, chunk_time)

        body = self._generate_request_body(
            transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
        )

        if websocket:
            generator = self._generate_ws(body)
        else:
            generator = self._generate_http(body)

        if stream:
            return generator

        chunks = []
        sampling_rate = None
        async for chunk in generator:
            if sampling_rate is None:
                sampling_rate = chunk["sampling_rate"]
            chunks.append(chunk["audio"])

        return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}

    async def _generate_http(self, body: Dict[str, Any]):
        """Stream audio chunks from the HTTP ``/audio/stream`` endpoint.

        Raises:
            ValueError: If the response status is outside the 2xx range.
        """
        async with self._session.post(
            f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
        ) as response:
            if response.status < 200 or response.status >= 300:
                # ClientResponse.text is a coroutine method; it must be called
                # and awaited to get the error body into the message.
                raise ValueError(f"Failed to generate audio. {await response.text()}")

            buffer = ""
            async for chunk_bytes in response.content.iter_any():
                buffer, outputs = update_buffer(buffer, chunk_bytes)
                for output in outputs:
                    yield output

            # Whatever is left may be one final complete object.
            if buffer:
                try:
                    chunk_json = json.loads(buffer)
                    audio = base64.b64decode(chunk_json["data"])
                    yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
                except json.JSONDecodeError:
                    pass

    async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
        """Generate audio using the websocket connection.

        Args:
            body: The request body.
            context_id: The context id for the request. If provided, the context
                id from each response is also included in the yielded dicts.

        Raises:
            RuntimeError: If receiving or converting a message fails.
        """
        include_context_id = bool(context_id)

        if not self.websocket or self._is_websocket_closed():
            await self._async_refresh_websocket()

        ws = self.websocket
        if context_id is None:
            context_id = uuid.uuid4().hex
        await ws.send_json({"data": body, "context_id": context_id})
        try:
            response = None
            while True:
                response = await ws.receive_json()
                if response["done"]:
                    break

                yield convert_response(response, include_context_id)

                if self.experimental_ws_handle_interrupts:
                    await ws.send_json({"context_id": context_id})
        except GeneratorExit:
            # The exit is only called when the generator is garbage collected.
            # It may not be called directly after a break statement.
            # However, the generator will be automatically cancelled on the next request.
            if self.experimental_ws_handle_interrupts:
                await ws.send_json({"context_id": context_id, "action": "cancel"})
        except Exception as e:
            raise RuntimeError(f"Failed to generate audio. {response}") from e

    def _is_websocket_closed(self):
        """Return whether the current aiohttp websocket is closed."""
        return self.websocket.closed

    async def cleanup(self):
        """Close the websocket (if open) and then the HTTP session (if open)."""
        if self.websocket is not None and not self._is_websocket_closed():
            await self.websocket.close()
        if not self._session.closed:
            await self._session.close()

    def __del__(self):
        # __init__ may have failed before these attributes were assigned.
        session = getattr(self, "_session", None)
        if session is None:
            return
        websocket = getattr(self, "websocket", None)
        websocket_open = websocket is not None and not websocket.closed
        if session.closed and not websocket_open:
            # Nothing left to release; avoid spinning up an event loop for it.
            return

        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop is None:
            asyncio.run(self.cleanup())
        else:
            loop.create_task(self.cleanup())
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.3"
1
+ __version__ = "0.0.5rc1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.3
3
+ Version: 0.0.5rc1
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -10,8 +10,11 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
11
  Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
- Requires-Dist: websockets
13
+ Requires-Dist: aiohttp
14
+ Requires-Dist: httpx
15
+ Requires-Dist: pytest-asyncio
14
16
  Requires-Dist: requests
17
+ Requires-Dist: websockets
15
18
  Provides-Extra: all
16
19
  Requires-Dist: pre-commit ; extra == 'all'
17
20
  Requires-Dist: docformatter ; extra == 'all'
@@ -21,6 +24,7 @@ Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
21
24
  Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
22
25
  Requires-Dist: pytest >=8.0.2 ; extra == 'all'
23
26
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
27
+ Requires-Dist: twine ; extra == 'all'
24
28
  Provides-Extra: dev
25
29
  Requires-Dist: pre-commit ; extra == 'dev'
26
30
  Requires-Dist: docformatter ; extra == 'dev'
@@ -30,6 +34,7 @@ Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
30
34
  Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
31
35
  Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
32
36
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
37
+ Requires-Dist: twine ; extra == 'dev'
33
38
 
34
39
 
35
40
  # Cartesia Python API Library
@@ -104,10 +109,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
104
109
  audio_data.seek(0)
105
110
 
106
111
  # Create an Audio object from the BytesIO data
107
- audio = Audio(audio_data, rate=output["sampling_rate"])
112
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
113
+
114
+ # Display the Audio object
115
+ display(audio)
116
+ ```
117
+
118
+ You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
119
+ ```python
120
+ from cartesia.tts import AsyncCartesiaTTS
121
+ from IPython.display import Audio
122
+ import io
123
+ import os
124
+
125
+ client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
126
+ voices = client.get_voices()
127
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
128
+ transcript = "Hello! Welcome to Cartesia"
129
+
130
+ # Create a BytesIO object to store the audio data
131
+ audio_data = io.BytesIO()
132
+
133
+ # Generate and stream audio
134
+ async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
135
+ buffer = output["audio"]
136
+ audio_data.write(buffer)
137
+
138
+ # Set the cursor position to the beginning of the BytesIO object
139
+ audio_data.seek(0)
140
+
141
+ # Create an Audio object from the BytesIO data
142
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
108
143
 
109
144
  # Display the Audio object
110
145
  display(audio)
111
146
  ```
112
147
 
113
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
148
+ To avoid storing your API key in the source code, we recommend doing one of the following:
149
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
150
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -0,0 +1,7 @@
1
+ cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
+ cartesia/tts.py,sha256=yPLz41AR0oAYPUNW48mqmwEEbLBHCnbaK_wPT0iFBVk,20543
3
+ cartesia/version.py,sha256=VkI5lk2CFatZR200RqGd8cBjTnMDmhtZW7DI6mPe6n4,25
4
+ cartesia-0.0.5rc1.dist-info/METADATA,sha256=632D6iZ2IU3MLySAnMtwV2zQA38XkQv1rfFF4iRdAco,4893
5
+ cartesia-0.0.5rc1.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
+ cartesia-0.0.5rc1.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
+ cartesia-0.0.5rc1.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
- cartesia/tts.py,sha256=ABXW9rc8Pn0GTRvb_7DHZKMtbvhGUiqOgHmvztwlOnI,12033
3
- cartesia/version.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
4
- cartesia-0.0.3.dist-info/METADATA,sha256=VsCGL1sITbKqERihK2rzVm9WIY5EJ5nCS_CXQ0s14ns,3604
5
- cartesia-0.0.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
- cartesia-0.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
- cartesia-0.0.3.dist-info/RECORD,,