cartesia 0.0.3__tar.gz → 0.0.5rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.3
3
+ Version: 0.0.5rc1
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -10,8 +10,11 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
11
  Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
- Requires-Dist: websockets
13
+ Requires-Dist: aiohttp
14
+ Requires-Dist: httpx
15
+ Requires-Dist: pytest-asyncio
14
16
  Requires-Dist: requests
17
+ Requires-Dist: websockets
15
18
  Provides-Extra: dev
16
19
  Requires-Dist: pre-commit; extra == "dev"
17
20
  Requires-Dist: docformatter; extra == "dev"
@@ -21,6 +24,7 @@ Requires-Dist: flake8==7.0.0; extra == "dev"
21
24
  Requires-Dist: flake8-bugbear==24.2.6; extra == "dev"
22
25
  Requires-Dist: pytest>=8.0.2; extra == "dev"
23
26
  Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
27
+ Requires-Dist: twine; extra == "dev"
24
28
  Provides-Extra: all
25
29
  Requires-Dist: pre-commit; extra == "all"
26
30
  Requires-Dist: docformatter; extra == "all"
@@ -30,6 +34,7 @@ Requires-Dist: flake8==7.0.0; extra == "all"
30
34
  Requires-Dist: flake8-bugbear==24.2.6; extra == "all"
31
35
  Requires-Dist: pytest>=8.0.2; extra == "all"
32
36
  Requires-Dist: pytest-cov>=4.1.0; extra == "all"
37
+ Requires-Dist: twine; extra == "all"
33
38
 
34
39
 
35
40
  # Cartesia Python API Library
@@ -104,10 +109,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
104
109
  audio_data.seek(0)
105
110
 
106
111
  # Create an Audio object from the BytesIO data
107
- audio = Audio(audio_data, rate=output["sampling_rate"])
112
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
113
+
114
+ # Display the Audio object
115
+ display(audio)
116
+ ```
117
+
118
+ You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
119
+ ```python
120
+ from cartesia.tts import AsyncCartesiaTTS
121
+ from IPython.display import Audio
122
+ import io
123
+ import os
124
+
125
+ client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
126
+ voices = client.get_voices()
127
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
128
+ transcript = "Hello! Welcome to Cartesia"
129
+
130
+ # Create a BytesIO object to store the audio data
131
+ audio_data = io.BytesIO()
132
+
133
+ # Generate and stream audio
134
+ async for output in client.generate(transcript=transcript, voice=voice, stream=True):
135
+ buffer = output["audio"]
136
+ audio_data.write(buffer)
137
+
138
+ # Set the cursor position to the beginning of the BytesIO object
139
+ audio_data.seek(0)
140
+
141
+ # Create an Audio object from the BytesIO data
142
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
108
143
 
109
144
  # Display the Audio object
110
145
  display(audio)
111
146
  ```
112
147
 
113
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
148
+ To avoid storing your API key in the source code, we recommend doing one of the following:
149
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
150
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -70,10 +70,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
70
70
  audio_data.seek(0)
71
71
 
72
72
  # Create an Audio object from the BytesIO data
73
- audio = Audio(audio_data, rate=output["sampling_rate"])
73
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
74
74
 
75
75
  # Display the Audio object
76
76
  display(audio)
77
77
  ```
78
78
 
79
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
79
+ You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
80
+ ```python
81
+ from cartesia.tts import AsyncCartesiaTTS
82
+ from IPython.display import Audio
83
+ import io
84
+ import os
85
+
86
+ client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
87
+ voices = client.get_voices()
88
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
89
+ transcript = "Hello! Welcome to Cartesia"
90
+
91
+ # Create a BytesIO object to store the audio data
92
+ audio_data = io.BytesIO()
93
+
94
+ # Generate and stream audio
95
+ async for output in client.generate(transcript=transcript, voice=voice, stream=True):
96
+ buffer = output["audio"]
97
+ audio_data.write(buffer)
98
+
99
+ # Set the cursor position to the beginning of the BytesIO object
100
+ audio_data.seek(0)
101
+
102
+ # Create an Audio object from the BytesIO data
103
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
104
+
105
+ # Display the Audio object
106
+ display(audio)
107
+ ```
108
+
109
+ To avoid storing your API key in the source code, we recommend doing one of the following:
110
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
111
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -1,15 +1,20 @@
1
+ import asyncio
1
2
  import base64
2
3
  import json
3
4
  import os
4
5
  import uuid
5
- from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
+ from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
6
7
 
8
+ import aiohttp
9
+ import httpx
7
10
  import requests
8
11
  from websockets.sync.client import connect
9
12
 
10
13
  DEFAULT_MODEL_ID = "genial-planet-1346"
11
14
  DEFAULT_BASE_URL = "api.cartesia.ai"
12
15
  DEFAULT_API_VERSION = "v0"
16
+ DEFAULT_TIMEOUT = 60 # seconds
17
+ DEFAULT_NUM_CONNECTIONS = 10 # connections per client
13
18
 
14
19
 
15
20
  class AudioOutput(TypedDict):
@@ -27,11 +32,46 @@ class VoiceMetadata(TypedDict):
27
32
  embedding: Optional[Embedding]
28
33
 
29
34
 
35
+ def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
36
+ buffer += chunk_bytes.decode("utf-8")
37
+ outputs = []
38
+ while "{" in buffer and "}" in buffer:
39
+ start_index = buffer.find("{")
40
+ end_index = buffer.find("}", start_index)
41
+ if start_index != -1 and end_index != -1:
42
+ try:
43
+ chunk_json = json.loads(buffer[start_index : end_index + 1])
44
+ audio = base64.b64decode(chunk_json["data"])
45
+ outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
46
+ buffer = buffer[end_index + 1 :]
47
+ except json.JSONDecodeError:
48
+ break
49
+ return buffer, outputs
50
+
51
+
52
+ def convert_response(response: Dict[str, any], include_context_id: bool) -> Dict[str, Any]:
53
+ audio = base64.b64decode(response["data"])
54
+
55
+ optional_kwargs = {}
56
+ if include_context_id:
57
+ optional_kwargs["context_id"] = response["context_id"]
58
+
59
+ return {
60
+ "audio": audio,
61
+ "sampling_rate": response["sampling_rate"],
62
+ **optional_kwargs,
63
+ }
64
+
65
+
30
66
  class CartesiaTTS:
31
67
  """The client for Cartesia's text-to-speech library.
32
68
 
33
69
  This client contains methods to interact with the Cartesia text-to-speech API.
34
- The API offers
70
+ The client can be used to retrieve available voices, compute new voice embeddings,
71
+ and generate speech from text.
72
+
73
+ The client also supports generating audio using a websocket for lower latency.
74
+ To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
35
75
 
36
76
  Examples:
37
77
 
@@ -55,18 +95,22 @@ class CartesiaTTS:
55
95
  ... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
56
96
  """
57
97
 
58
- def __init__(self, *, api_key: str = None):
98
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
59
99
  """
60
100
  Args:
61
101
  api_key: The API key to use for authorization.
62
102
  If not specified, the API key will be read from the environment variable
63
103
  `CARTESIA_API_KEY`.
104
+ experimental_ws_handle_interrupts: Whether to handle interrupts when generating
105
+ audio using the websocket. This is an experimental feature and may have bugs
106
+ or be deprecated in the future.
64
107
  """
65
108
  self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
66
109
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
67
110
  self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
68
111
  self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
69
112
  self.websocket = None
113
+ self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
70
114
  self.refresh_websocket()
71
115
 
72
116
  def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
@@ -100,9 +144,9 @@ class CartesiaTTS:
100
144
  >>> audio = client.generate(transcript="Hello world!", voice=embedding)
101
145
  """
102
146
  params = {"select": "id, name, description"} if skip_embeddings else None
103
- response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
147
+ response = httpx.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
104
148
 
105
- if response.status_code != 200:
149
+ if not response.is_success:
106
150
  raise ValueError(f"Failed to get voices. Error: {response.text}")
107
151
 
108
152
  voices = response.json()
@@ -134,20 +178,20 @@ class CartesiaTTS:
134
178
 
135
179
  if voice_id:
136
180
  url = f"{self._http_url()}/voices/embedding/{voice_id}"
137
- response = requests.get(url, headers=self.headers)
181
+ response = httpx.get(url, headers=self.headers)
138
182
  elif filepath:
139
183
  url = f"{self._http_url()}/voices/clone/clip"
140
184
  files = {"clip": open(filepath, "rb")}
141
185
  headers = self.headers.copy()
142
186
  # The default content type of JSON is incorrect for file uploads
143
187
  headers.pop("Content-Type")
144
- response = requests.post(url, headers=headers, files=files)
188
+ response = httpx.post(url, headers=headers, files=files)
145
189
  elif link:
146
190
  url = f"{self._http_url()}/voices/clone/url"
147
191
  params = {"link": link}
148
- response = requests.post(url, headers=self.headers, params=params)
192
+ response = httpx.post(url, headers=self.headers, params=params)
149
193
 
150
- if response.status_code != 200:
194
+ if not response.is_success:
151
195
  raise ValueError(
152
196
  f"Failed to clone voice. Status Code: {response.status_code}\n"
153
197
  f"Error: {response.text}"
@@ -167,8 +211,11 @@ class CartesiaTTS:
167
211
  """
168
212
  if self.websocket and not self._is_websocket_closed():
169
213
  self.websocket.close()
214
+ route = "audio/websocket"
215
+ if self.experimental_ws_handle_interrupts:
216
+ route = f"experimental/{route}"
170
217
  self.websocket = connect(
171
- f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
218
+ f"{self._ws_url()}/{route}?api_key={self.api_key}",
172
219
  close_timeout=None,
173
220
  )
174
221
 
@@ -189,6 +236,29 @@ class CartesiaTTS:
189
236
  if transcript.strip() == "":
190
237
  raise ValueError("`transcript` must be non empty")
191
238
 
239
+ def _generate_request_body(
240
+ self,
241
+ *,
242
+ transcript: str,
243
+ duration: int = None,
244
+ chunk_time: float = None,
245
+ voice: Embedding = None,
246
+ ) -> Dict[str, Any]:
247
+ """
248
+ Create the request body for a stream request.
249
+ Note that anything that's not provided will use a default if available or be filtered out otherwise.
250
+ """
251
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=voice)
252
+
253
+ optional_body = dict(
254
+ duration=duration,
255
+ chunk_time=chunk_time,
256
+ voice=voice,
257
+ )
258
+ body.update({k: v for k, v in optional_body.items() if v is not None})
259
+
260
+ return body
261
+
192
262
  def generate(
193
263
  self,
194
264
  *,
@@ -221,14 +291,9 @@ class CartesiaTTS:
221
291
  """
222
292
  self._check_inputs(transcript, duration, chunk_time)
223
293
 
224
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
225
-
226
- optional_body = dict(
227
- duration=duration,
228
- chunk_time=chunk_time,
229
- voice=voice,
294
+ body = self._generate_request_body(
295
+ transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
230
296
  )
231
- body.update({k: v for k, v in optional_body.items() if v is not None})
232
297
 
233
298
  if websocket:
234
299
  generator = self._generate_ws(body)
@@ -254,23 +319,14 @@ class CartesiaTTS:
254
319
  data=json.dumps(body),
255
320
  headers=self.headers,
256
321
  )
257
- if response.status_code != 200:
322
+ if not response.ok:
258
323
  raise ValueError(f"Failed to generate audio. {response.text}")
259
324
 
260
325
  buffer = ""
261
326
  for chunk_bytes in response.iter_content(chunk_size=None):
262
- buffer += chunk_bytes.decode("utf-8")
263
- while "{" in buffer and "}" in buffer:
264
- start_index = buffer.find("{")
265
- end_index = buffer.find("}", start_index)
266
- if start_index != -1 and end_index != -1:
267
- try:
268
- chunk_json = json.loads(buffer[start_index : end_index + 1])
269
- audio = base64.b64decode(chunk_json["data"])
270
- yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
271
- buffer = buffer[end_index + 1 :]
272
- except json.JSONDecodeError:
273
- break
327
+ buffer, outputs = update_buffer(buffer, chunk_bytes)
328
+ for output in outputs:
329
+ yield output
274
330
 
275
331
  if buffer:
276
332
  try:
@@ -280,21 +336,41 @@ class CartesiaTTS:
280
336
  except json.JSONDecodeError:
281
337
  pass
282
338
 
283
- def _generate_ws(self, body: Dict[str, Any]):
339
+ def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
340
+ """Generate audio using the websocket connection.
341
+
342
+ Args:
343
+ body: The request body.
344
+ context_id: The context id for the request.
345
+ The context id must be globally unique for the duration this client exists.
346
+ If this is provided, the context id that is in the response will
347
+ also be returned as part of the dict. This is helpful for testing.
348
+ """
284
349
  if not self.websocket or self._is_websocket_closed():
285
350
  self.refresh_websocket()
286
351
 
287
- self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
352
+ include_context_id = bool(context_id)
353
+ if context_id is None:
354
+ context_id = uuid.uuid4().hex
355
+ self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
288
356
  try:
289
- response = json.loads(self.websocket.recv())
290
- while not response["done"]:
291
- audio = base64.b64decode(response["data"])
292
- # print("timing", time.perf_counter() - start)
293
- yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
294
-
357
+ while True:
295
358
  response = json.loads(self.websocket.recv())
296
- except Exception:
297
- raise RuntimeError(f"Failed to generate audio. {response}")
359
+ if response["done"]:
360
+ break
361
+
362
+ yield convert_response(response, include_context_id)
363
+
364
+ if self.experimental_ws_handle_interrupts:
365
+ self.websocket.send(json.dumps({"context_id": context_id}))
366
+ except GeneratorExit:
367
+ # The exit is only called when the generator is garbage collected.
368
+ # It may not be called directly after a break statement.
369
+ # However, the generator will be automatically cancelled on the next request.
370
+ if self.experimental_ws_handle_interrupts:
371
+ self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
372
+ except Exception as e:
373
+ raise RuntimeError(f"Failed to generate audio. {response}") from e
298
374
 
299
375
  def _http_url(self):
300
376
  prefix = "http" if "localhost" in self.base_url else "https"
@@ -307,3 +383,143 @@ class CartesiaTTS:
307
383
  def __del__(self):
308
384
  if self.websocket.socket.fileno() > -1:
309
385
  self.websocket.close()
386
+
387
+
388
+ class AsyncCartesiaTTS(CartesiaTTS):
389
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
390
+ self.timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
391
+ self.connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
392
+ self._session = aiohttp.ClientSession(timeout=self.timeout, connector=self.connector)
393
+ super().__init__(
394
+ api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
395
+ )
396
+
397
+ def refresh_websocket(self):
398
+ pass # do not load the websocket for the client until asynchronously when it is needed
399
+
400
+ async def _async_refresh_websocket(self):
401
+ """Refresh the websocket connection."""
402
+ if self.websocket and not self._is_websocket_closed():
403
+ self.websocket.close()
404
+ route = "audio/websocket"
405
+ if self.experimental_ws_handle_interrupts:
406
+ route = f"experimental/{route}"
407
+ self.websocket = await self._session.ws_connect(
408
+ f"{self._ws_url()}/{route}?api_key={self.api_key}"
409
+ )
410
+
411
+ async def generate(
412
+ self,
413
+ *,
414
+ transcript: str,
415
+ duration: int = None,
416
+ chunk_time: float = None,
417
+ voice: Embedding = None,
418
+ stream: bool = False,
419
+ websocket: bool = True,
420
+ ) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
421
+ """Asynchronously generate audio from a transcript.
422
+ NOTE: This overrides the non-asynchronous generate method from the base class.
423
+ Args:
424
+ transcript: The text to generate audio for.
425
+ voice: The embedding to use for generating audio.
426
+ options: The options to use for generating audio. See :class:`GenerateOptions`.
427
+ Returns:
428
+ A dictionary containing the following:
429
+ * "audio": The audio as a 1D numpy array.
430
+ * "sampling_rate": The sampling rate of the audio.
431
+ """
432
+ body = self._generate_request_body(
433
+ transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
434
+ )
435
+
436
+ if websocket:
437
+ generator = self._generate_ws(body)
438
+ else:
439
+ generator = self._generate_http(body)
440
+
441
+ if stream:
442
+ return generator
443
+
444
+ chunks = []
445
+ sampling_rate = None
446
+ async for chunk in generator:
447
+ if sampling_rate is None:
448
+ sampling_rate = chunk["sampling_rate"]
449
+ chunks.append(chunk["audio"])
450
+
451
+ return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
452
+
453
+ async def _generate_http(self, body: Dict[str, Any]):
454
+ async with self._session.post(
455
+ f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
456
+ ) as response:
457
+ if response.status < 200 or response.status >= 300:
458
+ raise ValueError(f"Failed to generate audio. {response.text}")
459
+
460
+ buffer = ""
461
+ async for chunk_bytes in response.content.iter_any():
462
+ buffer, outputs = update_buffer(buffer, chunk_bytes)
463
+ for output in outputs:
464
+ yield output
465
+
466
+ if buffer:
467
+ try:
468
+ chunk_json = json.loads(buffer)
469
+ audio = base64.b64decode(chunk_json["data"])
470
+ yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
471
+ except json.JSONDecodeError:
472
+ pass
473
+
474
+ async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
475
+ include_context_id = bool(context_id)
476
+ route = "audio/websocket"
477
+ if self.experimental_ws_handle_interrupts:
478
+ route = f"experimental/{route}"
479
+
480
+ if not self.websocket or self._is_websocket_closed():
481
+ await self._async_refresh_websocket()
482
+
483
+ ws = self.websocket
484
+ if context_id is None:
485
+ context_id = uuid.uuid4().hex
486
+ await ws.send_json({"data": body, "context_id": context_id})
487
+ try:
488
+ response = None
489
+ while True:
490
+ response = await ws.receive_json()
491
+ if response["done"]:
492
+ break
493
+
494
+ yield convert_response(response, include_context_id)
495
+
496
+ if self.experimental_ws_handle_interrupts:
497
+ await ws.send_json({"context_id": context_id})
498
+ except GeneratorExit:
499
+ # The exit is only called when the generator is garbage collected.
500
+ # It may not be called directly after a break statement.
501
+ # However, the generator will be automatically cancelled on the next request.
502
+ if self.experimental_ws_handle_interrupts:
503
+ await ws.send_json({"context_id": context_id, "action": "cancel"})
504
+ except Exception as e:
505
+ raise RuntimeError(f"Failed to generate audio. {response}") from e
506
+
507
+ def _is_websocket_closed(self):
508
+ return self.websocket.closed
509
+
510
+ async def cleanup(self):
511
+ if self.websocket is not None and not self._is_websocket_closed():
512
+ await self.websocket.close()
513
+ if not self._session.closed:
514
+ await self._session.close()
515
+
516
+ def __del__(self):
517
+ try:
518
+ loop = asyncio.get_running_loop()
519
+ except RuntimeError:
520
+ loop = None
521
+
522
+ if loop is None:
523
+ asyncio.run(self.cleanup())
524
+ else:
525
+ loop.create_task(self.cleanup())
@@ -0,0 +1 @@
1
+ __version__ = "0.0.5rc1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.3
3
+ Version: 0.0.5rc1
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -10,8 +10,11 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
11
  Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
- Requires-Dist: websockets
13
+ Requires-Dist: aiohttp
14
+ Requires-Dist: httpx
15
+ Requires-Dist: pytest-asyncio
14
16
  Requires-Dist: requests
17
+ Requires-Dist: websockets
15
18
  Provides-Extra: dev
16
19
  Requires-Dist: pre-commit; extra == "dev"
17
20
  Requires-Dist: docformatter; extra == "dev"
@@ -21,6 +24,7 @@ Requires-Dist: flake8==7.0.0; extra == "dev"
21
24
  Requires-Dist: flake8-bugbear==24.2.6; extra == "dev"
22
25
  Requires-Dist: pytest>=8.0.2; extra == "dev"
23
26
  Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
27
+ Requires-Dist: twine; extra == "dev"
24
28
  Provides-Extra: all
25
29
  Requires-Dist: pre-commit; extra == "all"
26
30
  Requires-Dist: docformatter; extra == "all"
@@ -30,6 +34,7 @@ Requires-Dist: flake8==7.0.0; extra == "all"
30
34
  Requires-Dist: flake8-bugbear==24.2.6; extra == "all"
31
35
  Requires-Dist: pytest>=8.0.2; extra == "all"
32
36
  Requires-Dist: pytest-cov>=4.1.0; extra == "all"
37
+ Requires-Dist: twine; extra == "all"
33
38
 
34
39
 
35
40
  # Cartesia Python API Library
@@ -104,10 +109,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
104
109
  audio_data.seek(0)
105
110
 
106
111
  # Create an Audio object from the BytesIO data
107
- audio = Audio(audio_data, rate=output["sampling_rate"])
112
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
113
+
114
+ # Display the Audio object
115
+ display(audio)
116
+ ```
117
+
118
+ You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
119
+ ```python
120
+ from cartesia.tts import AsyncCartesiaTTS
121
+ from IPython.display import Audio
122
+ import io
123
+ import os
124
+
125
+ client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
126
+ voices = client.get_voices()
127
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
128
+ transcript = "Hello! Welcome to Cartesia"
129
+
130
+ # Create a BytesIO object to store the audio data
131
+ audio_data = io.BytesIO()
132
+
133
+ # Generate and stream audio
134
+ async for output in client.generate(transcript=transcript, voice=voice, stream=True):
135
+ buffer = output["audio"]
136
+ audio_data.write(buffer)
137
+
138
+ # Set the cursor position to the beginning of the BytesIO object
139
+ audio_data.seek(0)
140
+
141
+ # Create an Audio object from the BytesIO data
142
+ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
108
143
 
109
144
  # Display the Audio object
110
145
  display(audio)
111
146
  ```
112
147
 
113
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
148
+ To avoid storing your API key in the source code, we recommend doing one of the following:
149
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
150
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -1,5 +1,8 @@
1
- websockets
1
+ aiohttp
2
+ httpx
3
+ pytest-asyncio
2
4
  requests
5
+ websockets
3
6
 
4
7
  [all]
5
8
  pre-commit
@@ -10,6 +13,7 @@ flake8==7.0.0
10
13
  flake8-bugbear==24.2.6
11
14
  pytest>=8.0.2
12
15
  pytest-cov>=4.1.0
16
+ twine
13
17
 
14
18
  [dev]
15
19
  pre-commit
@@ -20,3 +24,4 @@ flake8==7.0.0
20
24
  flake8-bugbear==24.2.6
21
25
  pytest>=8.0.2
22
26
  pytest-cov>=4.1.0
27
+ twine
@@ -78,7 +78,8 @@ class UploadCommand(Command):
78
78
  """Support setup.py upload."""
79
79
 
80
80
  description = "Build and publish the package."
81
- user_options = []
81
+ user_options = [("skip-upload", "u", "skip git tagging and pypi upload")]
82
+ boolean_options = ["skip-upload"]
82
83
 
83
84
  @staticmethod
84
85
  def status(s):
@@ -86,21 +87,26 @@ class UploadCommand(Command):
86
87
  print("\033[1m{0}\033[0m".format(s))
87
88
 
88
89
  def initialize_options(self):
89
- pass
90
+ self.skip_upload = False
90
91
 
91
92
  def finalize_options(self):
92
- pass
93
+ self.skip_upload = bool(self.skip_upload)
93
94
 
94
95
  def run(self):
95
96
  try:
96
97
  self.status("Removing previous builds…")
97
98
  rmtree(os.path.join(here, "dist"))
99
+ rmtree(os.path.join(here, "build"))
98
100
  except OSError:
99
101
  pass
100
102
 
101
103
  self.status("Building Source and Wheel (universal) distribution…")
102
104
  os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
103
105
 
106
+ if self.skip_upload:
107
+ self.status("Skipping git tagging and pypi upload")
108
+ sys.exit()
109
+
104
110
  self.status("Uploading the package to PyPI via Twine…")
105
111
  os.system("twine upload dist/*")
106
112
 
@@ -116,6 +122,9 @@ class BumpVersionCommand(Command):
116
122
  To use: python setup.py bumpversion -v <version>
117
123
 
118
124
  This command will push the new version directly and tag it.
125
+
126
+ Usage:
127
+ python setup.py bumpversion --version=1.0.1
119
128
  """
120
129
 
121
130
  description = "Installs the foo."
@@ -130,6 +139,11 @@ class BumpVersionCommand(Command):
130
139
 
131
140
  def initialize_options(self):
132
141
  self.version = None
142
+ self.base_branch = None
143
+ self.version_branch = None
144
+ self.updated_files = [
145
+ "cartesia/version.py",
146
+ ]
133
147
 
134
148
  def finalize_options(self):
135
149
  # This package cannot be imported at top level because it
@@ -147,14 +161,18 @@ class BumpVersionCommand(Command):
147
161
  )
148
162
 
149
163
  def _undo(self):
150
- os.system(f"git restore --staged {PACKAGE_DIR}/__init__.py")
151
- os.system(f"git checkout -- {PACKAGE_DIR}/__init__.py")
164
+ os.system(f"git restore --staged {' '.join(self.updated_files)}")
165
+ os.system(f"git checkout -- {' '.join(self.updated_files)}")
166
+
167
+ # Return to the original branch
168
+ os.system(f"git checkout {self.base_branch}")
169
+ os.system(f"git branch -D {self.version_branch}")
152
170
 
153
171
  def run(self):
154
172
  current_version = about["__version__"]
155
173
 
156
174
  self.status("Checking current branch is 'main'")
157
- current_branch = get_git_branch()
175
+ self.base_branch = current_branch = get_git_branch()
158
176
  if current_branch != "main":
159
177
  raise RuntimeError(
160
178
  "You can only bump the version from the 'main' branch. "
@@ -174,33 +192,43 @@ class BumpVersionCommand(Command):
174
192
 
175
193
  # TODO: Add check to see if all tests are passing on main.
176
194
 
195
+ # Checkout new branch
196
+ self.version_branch = f"bumpversion/v{self.version}"
197
+ self.status(f"Create branch '{self.version_branch}'")
198
+ err_code = os.system(f"git checkout -b {self.version_branch}")
199
+ if err_code != 0:
200
+ raise RuntimeError("Failed to create branch.")
201
+
177
202
  # Change the version in __init__.py
178
203
  self.status(f"Updating version {current_version} -> {self.version}")
179
204
  update_version(self.version)
180
- if current_version != self.version:
181
- self._undo()
182
- raise RuntimeError("Failed to update version.")
205
+ # if current_version != self.version:
206
+ # self._undo()
207
+ # raise RuntimeError("Failed to update version.")
183
208
 
184
- self.status(f"Adding {PACKAGE_DIR}/__init__.py to git")
185
- err_code = os.system(f"git add {PACKAGE_DIR}/__init__.py")
209
+ self.status(f"Adding {', '.join(self.updated_files)} to git")
210
+ err_code = os.system(f"git add {' '.join(self.updated_files)}")
186
211
  if err_code != 0:
187
212
  self._undo()
188
- raise RuntimeError("Failed to add file to git.")
213
+ raise RuntimeError("Failed to add files to git.")
189
214
 
190
215
  # Commit the file with a message '[bumpversion] v<version>'.
191
216
  self.status(f"Commit with message '[bumpversion] v{self.version}'")
192
- err_code = os.system("git commit -m '[bumpversion] v{}'".format(current_version))
217
+ err_code = os.system("git commit -m '[bumpversion] v{}'".format(self.version))
193
218
  if err_code != 0:
194
219
  self._undo()
195
220
  raise RuntimeError("Failed to commit file to git.")
196
221
 
197
222
  # Push the commit to origin.
198
- # self.status("Pushing commit to origin")
199
- # err_code = os.system("git push")
200
- # if err_code != 0:
201
- # # TODO: undo the commit automatically.
202
- # raise RuntimeError("Failed to push commit to origin.")
223
+ self.status(f"Pushing commit to origin/{self.version_branch}")
224
+ err_code = os.system(f"git push --force --set-upstream origin {self.version_branch}")
225
+ if err_code != 0:
226
+ # TODO: undo the commit automatically.
227
+ self._undo()
228
+ raise RuntimeError("Failed to push commit to origin.")
203
229
 
230
+ os.system(f"git checkout {self.base_branch}")
231
+ os.system(f"git branch -D {self.version_branch}")
204
232
  sys.exit()
205
233
 
206
234
 
@@ -6,11 +6,14 @@ but rather for general correctness.
6
6
  """
7
7
 
8
8
  import os
9
- from typing import Dict, Generator
9
+ import sys
10
+ import uuid
11
+ from typing import AsyncGenerator, Dict, Generator, List
10
12
 
11
13
  import pytest
12
14
 
13
- from cartesia.tts import CartesiaTTS, VoiceMetadata
15
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
16
+ from cartesia.tts import DEFAULT_MODEL_ID, AsyncCartesiaTTS, CartesiaTTS, VoiceMetadata
14
17
 
15
18
  SAMPLE_VOICE = "Milo"
16
19
 
@@ -21,9 +24,24 @@ class _Resources:
21
24
  self.voices = voices
22
25
 
23
26
 
27
+ def create_client():
28
+ return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
29
+
30
+
31
+ def create_async_client():
32
+ return AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
33
+
34
+
24
35
  @pytest.fixture(scope="session")
25
36
  def client():
26
- return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
37
+ return create_client()
38
+
39
+
40
+ @pytest.fixture(scope="session")
41
+ def client_with_ws_interrupt():
42
+ return CartesiaTTS(
43
+ api_key=os.environ.get("CARTESIA_API_KEY"), experimental_ws_handle_interrupts=True
44
+ )
27
45
 
28
46
 
29
47
  @pytest.fixture(scope="session")
@@ -93,6 +111,80 @@ def test_generate_stream(resources: _Resources, websocket: bool):
93
111
  assert isinstance(output["sampling_rate"], int)
94
112
 
95
113
 
114
+ @pytest.mark.parametrize("websocket", [True, False])
115
+ @pytest.mark.asyncio
116
+ async def test_async_generate(resources: _Resources, websocket: bool):
117
+ voices = resources.voices
118
+ embedding = voices[SAMPLE_VOICE]["embedding"]
119
+ transcript = "Hello, world!"
120
+
121
+ async_client = create_async_client()
122
+ output = await async_client.generate(
123
+ transcript=transcript, voice=embedding, websocket=websocket
124
+ )
125
+
126
+ assert output.keys() == {"audio", "sampling_rate"}
127
+ assert isinstance(output["audio"], bytes)
128
+ assert isinstance(output["sampling_rate"], int)
129
+
130
+
131
+ @pytest.mark.parametrize("websocket", [True, False])
132
+ @pytest.mark.asyncio
133
+ async def test_async_generate_stream(resources: _Resources, websocket: bool):
134
+ voices = resources.voices
135
+ embedding = voices[SAMPLE_VOICE]["embedding"]
136
+ transcript = "Hello, world!"
137
+
138
+ async_client = create_async_client()
139
+
140
+ generator = await async_client.generate(transcript=transcript, voice=embedding, stream=True)
141
+ assert isinstance(generator, AsyncGenerator)
142
+
143
+ async for output in generator:
144
+ assert output.keys() == {"audio", "sampling_rate"}
145
+ assert isinstance(output["audio"], bytes)
146
+ assert isinstance(output["sampling_rate"], int)
147
+
148
+
149
+ @pytest.mark.parametrize(
150
+ "actions",
151
+ [
152
+ ["cancel-5", None],
153
+ ["cancel-5", "cancel-1", None],
154
+ [None, "cancel-3", None],
155
+ [None, "cancel-1", "cancel-2"],
156
+ ],
157
+ )
158
+ def test_generate_stream_interrupt(
159
+ client_with_ws_interrupt: CartesiaTTS, resources: _Resources, actions: List[str]
160
+ ):
161
+ client = client_with_ws_interrupt
162
+ voices = resources.voices
163
+ embedding = voices[SAMPLE_VOICE]["embedding"]
164
+ transcript = "Hello, world!"
165
+
166
+ context_ids = [f"test-{uuid.uuid4().hex[:6]}" for _ in range(len(actions))]
167
+
168
+ for context_id, action in zip(context_ids, actions):
169
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=embedding)
170
+
171
+ # Parse actions to see what we should expect.
172
+ if action is None:
173
+ num_turns = None
174
+ elif "cancel" in action:
175
+ num_turns = int(action.split("-")[1])
176
+
177
+ generator = client._generate_ws(body, context_id=context_id)
178
+ for idx, response in enumerate(generator):
179
+ assert response.keys() == {"audio", "sampling_rate", "context_id"}
180
+ assert response["context_id"] == context_id, (
181
+ f"Context ID from response ({response['context_id']}) does not match "
182
+ f"the expected context ID ({context_id})"
183
+ )
184
+ if idx + 1 == num_turns:
185
+ break
186
+
187
+
96
188
  @pytest.mark.parametrize("chunk_time", [0.05, 0.6])
97
189
  def test_check_inputs_invalid_chunk_time(client: CartesiaTTS, chunk_time):
98
190
  with pytest.raises(ValueError, match="`chunk_time` must be between 0.1 and 0.5"):
@@ -1 +0,0 @@
1
- __version__ = "0.0.3"
File without changes
File without changes