cartesia 0.0.3__py2.py3-none-any.whl → 0.0.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +2 -2
- cartesia/tts.py +429 -65
- cartesia/utils.py +65 -0
- cartesia/version.py +1 -1
- cartesia-0.0.5.dist-info/METADATA +187 -0
- cartesia-0.0.5.dist-info/RECORD +8 -0
- {cartesia-0.0.3.dist-info → cartesia-0.0.5.dist-info}/WHEEL +1 -1
- cartesia-0.0.3.dist-info/METADATA +0 -113
- cartesia-0.0.3.dist-info/RECORD +0 -7
- {cartesia-0.0.3.dist-info → cartesia-0.0.5.dist-info}/top_level.txt +0 -0
cartesia/__init__.py
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
from cartesia.tts import CartesiaTTS
|
1
|
+
from cartesia.tts import AsyncCartesiaTTS, CartesiaTTS
|
2
2
|
|
3
|
-
__all__ = ["CartesiaTTS"]
|
3
|
+
__all__ = ["CartesiaTTS", "AsyncCartesiaTTS"]
|
cartesia/tts.py
CHANGED
@@ -1,16 +1,29 @@
|
|
1
|
+
import asyncio
|
1
2
|
import base64
|
2
3
|
import json
|
3
4
|
import os
|
4
5
|
import uuid
|
5
|
-
from
|
6
|
+
from types import TracebackType
|
7
|
+
from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
|
6
8
|
|
9
|
+
import aiohttp
|
10
|
+
import httpx
|
11
|
+
import logging
|
7
12
|
import requests
|
8
13
|
from websockets.sync.client import connect
|
9
14
|
|
15
|
+
from cartesia.utils import retry_on_connection_error, retry_on_connection_error_async
|
16
|
+
|
10
17
|
DEFAULT_MODEL_ID = "genial-planet-1346"
|
11
18
|
DEFAULT_BASE_URL = "api.cartesia.ai"
|
12
19
|
DEFAULT_API_VERSION = "v0"
|
20
|
+
DEFAULT_TIMEOUT = 30 # seconds
|
21
|
+
DEFAULT_NUM_CONNECTIONS = 10 # connections per client
|
22
|
+
|
23
|
+
BACKOFF_FACTOR = 1
|
24
|
+
MAX_RETRIES = 3
|
13
25
|
|
26
|
+
logger = logging.getLogger(__name__)
|
14
27
|
|
15
28
|
class AudioOutput(TypedDict):
|
16
29
|
audio: bytes
|
@@ -27,14 +40,48 @@ class VoiceMetadata(TypedDict):
|
|
27
40
|
embedding: Optional[Embedding]
|
28
41
|
|
29
42
|
|
43
|
+
def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
|
44
|
+
buffer += chunk_bytes.decode("utf-8")
|
45
|
+
outputs = []
|
46
|
+
while "{" in buffer and "}" in buffer:
|
47
|
+
start_index = buffer.find("{")
|
48
|
+
end_index = buffer.find("}", start_index)
|
49
|
+
if start_index != -1 and end_index != -1:
|
50
|
+
try:
|
51
|
+
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
52
|
+
audio = base64.b64decode(chunk_json["data"])
|
53
|
+
outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
|
54
|
+
buffer = buffer[end_index + 1 :]
|
55
|
+
except json.JSONDecodeError:
|
56
|
+
break
|
57
|
+
return buffer, outputs
|
58
|
+
|
59
|
+
|
60
|
+
def convert_response(response: Dict[str, any], include_context_id: bool) -> Dict[str, Any]:
|
61
|
+
audio = base64.b64decode(response["data"])
|
62
|
+
|
63
|
+
optional_kwargs = {}
|
64
|
+
if include_context_id:
|
65
|
+
optional_kwargs["context_id"] = response["context_id"]
|
66
|
+
|
67
|
+
return {
|
68
|
+
"audio": audio,
|
69
|
+
"sampling_rate": response["sampling_rate"],
|
70
|
+
**optional_kwargs,
|
71
|
+
}
|
72
|
+
|
73
|
+
|
30
74
|
class CartesiaTTS:
|
31
75
|
"""The client for Cartesia's text-to-speech library.
|
32
76
|
|
33
77
|
This client contains methods to interact with the Cartesia text-to-speech API.
|
34
|
-
The
|
78
|
+
The client can be used to retrieve available voices, compute new voice embeddings,
|
79
|
+
and generate speech from text.
|
35
80
|
|
36
|
-
|
81
|
+
The client also supports generating audio using a websocket for lower latency.
|
82
|
+
To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
|
37
83
|
|
84
|
+
Examples:
|
38
85
|
>>> client = CartesiaTTS()
|
39
86
|
|
40
87
|
# Load available voices and their metadata (excluding the embeddings).
|
@@ -55,19 +102,21 @@ class CartesiaTTS:
|
|
55
102
|
... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
|
56
103
|
"""
|
57
104
|
|
58
|
-
def __init__(self, *, api_key: str = None):
|
59
|
-
"""
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
105
|
+
def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
|
106
|
+
"""Args:
|
107
|
+
api_key: The API key to use for authorization.
|
108
|
+
If not specified, the API key will be read from the environment variable
|
109
|
+
`CARTESIA_API_KEY`.
|
110
|
+
experimental_ws_handle_interrupts: Whether to handle interrupts when generating
|
111
|
+
audio using the websocket. This is an experimental feature and may have bugs
|
112
|
+
or be deprecated in the future.
|
64
113
|
"""
|
65
114
|
self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
|
66
115
|
self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
|
67
116
|
self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
|
68
117
|
self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
|
69
118
|
self.websocket = None
|
70
|
-
self.
|
119
|
+
self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
|
71
120
|
|
72
121
|
def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
|
73
122
|
"""Returns a mapping from voice name -> voice metadata.
|
@@ -100,9 +149,14 @@ class CartesiaTTS:
|
|
100
149
|
>>> audio = client.generate(transcript="Hello world!", voice=embedding)
|
101
150
|
"""
|
102
151
|
params = {"select": "id, name, description"} if skip_embeddings else None
|
103
|
-
response =
|
152
|
+
response = httpx.get(
|
153
|
+
f"{self._http_url()}/voices",
|
154
|
+
headers=self.headers,
|
155
|
+
params=params,
|
156
|
+
timeout=DEFAULT_TIMEOUT,
|
157
|
+
)
|
104
158
|
|
105
|
-
if response.
|
159
|
+
if not response.is_success:
|
106
160
|
raise ValueError(f"Failed to get voices. Error: {response.text}")
|
107
161
|
|
108
162
|
voices = response.json()
|
@@ -112,6 +166,7 @@ class CartesiaTTS:
|
|
112
166
|
voice["embedding"] = json.loads(voice["embedding"])
|
113
167
|
return {voice["name"]: voice for voice in voices}
|
114
168
|
|
169
|
+
@retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
|
115
170
|
def get_voice_embedding(
|
116
171
|
self, *, voice_id: str = None, filepath: str = None, link: str = None
|
117
172
|
) -> Embedding:
|
@@ -134,20 +189,20 @@ class CartesiaTTS:
|
|
134
189
|
|
135
190
|
if voice_id:
|
136
191
|
url = f"{self._http_url()}/voices/embedding/{voice_id}"
|
137
|
-
response =
|
192
|
+
response = httpx.get(url, headers=self.headers, timeout=DEFAULT_TIMEOUT)
|
138
193
|
elif filepath:
|
139
194
|
url = f"{self._http_url()}/voices/clone/clip"
|
140
195
|
files = {"clip": open(filepath, "rb")}
|
141
196
|
headers = self.headers.copy()
|
142
197
|
# The default content type of JSON is incorrect for file uploads
|
143
198
|
headers.pop("Content-Type")
|
144
|
-
response =
|
199
|
+
response = httpx.post(url, headers=headers, files=files, timeout=DEFAULT_TIMEOUT)
|
145
200
|
elif link:
|
146
201
|
url = f"{self._http_url()}/voices/clone/url"
|
147
202
|
params = {"link": link}
|
148
|
-
response =
|
203
|
+
response = httpx.post(url, headers=self.headers, params=params, timeout=DEFAULT_TIMEOUT)
|
149
204
|
|
150
|
-
if response.
|
205
|
+
if not response.is_success:
|
151
206
|
raise ValueError(
|
152
207
|
f"Failed to clone voice. Status Code: {response.status_code}\n"
|
153
208
|
f"Error: {response.text}"
|
@@ -165,12 +220,11 @@ class CartesiaTTS:
|
|
165
220
|
Note:
|
166
221
|
The connection is synchronous.
|
167
222
|
"""
|
168
|
-
if self.websocket
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
)
|
223
|
+
if self.websocket is None or self._is_websocket_closed():
|
224
|
+
route = "audio/websocket"
|
225
|
+
if self.experimental_ws_handle_interrupts:
|
226
|
+
route = f"experimental/{route}"
|
227
|
+
self.websocket = connect(f"{self._ws_url()}/{route}?api_key={self.api_key}")
|
174
228
|
|
175
229
|
def _is_websocket_closed(self):
|
176
230
|
return self.websocket.socket.fileno() == -1
|
@@ -189,29 +243,53 @@ class CartesiaTTS:
|
|
189
243
|
if transcript.strip() == "":
|
190
244
|
raise ValueError("`transcript` must be non empty")
|
191
245
|
|
246
|
+
def _generate_request_body(
|
247
|
+
self,
|
248
|
+
*,
|
249
|
+
transcript: str,
|
250
|
+
voice: Embedding,
|
251
|
+
model_id: str,
|
252
|
+
duration: int = None,
|
253
|
+
chunk_time: float = None,
|
254
|
+
) -> Dict[str, Any]:
|
255
|
+
"""Create the request body for a stream request.
|
256
|
+
|
257
|
+
Note that anything that's not provided will use a default if available or be
|
258
|
+
filtered out otherwise.
|
259
|
+
"""
|
260
|
+
body = dict(transcript=transcript, model_id=model_id, voice=voice)
|
261
|
+
|
262
|
+
optional_body = dict(
|
263
|
+
duration=duration,
|
264
|
+
chunk_time=chunk_time,
|
265
|
+
)
|
266
|
+
body.update({k: v for k, v in optional_body.items() if v is not None})
|
267
|
+
|
268
|
+
return body
|
269
|
+
|
192
270
|
def generate(
|
193
271
|
self,
|
194
272
|
*,
|
195
273
|
transcript: str,
|
274
|
+
voice: Embedding,
|
275
|
+
model_id: str = DEFAULT_MODEL_ID,
|
196
276
|
duration: int = None,
|
197
277
|
chunk_time: float = None,
|
198
|
-
voice: Embedding = None,
|
199
278
|
stream: bool = False,
|
200
279
|
websocket: bool = True,
|
201
280
|
) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
|
202
281
|
"""Generate audio from a transcript.
|
203
282
|
|
204
283
|
Args:
|
205
|
-
transcript: The text to generate audio for.
|
206
|
-
|
207
|
-
|
284
|
+
transcript (str): The text to generate audio for.
|
285
|
+
voice (Embedding (List[float])): The voice to use for generating audio.
|
286
|
+
duration (int, optional): The maximum duration of the audio in seconds.
|
287
|
+
chunk_time (float, optional): How long each audio segment should be in seconds.
|
208
288
|
This should not need to be adjusted.
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
websocket: Whether to use a websocket for streaming audio.
|
214
|
-
Using the websocket reduces latency by pre-poning the handshake.
|
289
|
+
stream (bool, optional): Whether to stream the audio or not.
|
290
|
+
If True this function returns a generator. False by default.
|
291
|
+
websocket (bool, optional): Whether to use a websocket for streaming audio.
|
292
|
+
Using the websocket reduces latency by pre-poning the handshake. True by default.
|
215
293
|
|
216
294
|
Returns:
|
217
295
|
A generator if `stream` is True, otherwise a dictionary.
|
@@ -221,19 +299,18 @@ class CartesiaTTS:
|
|
221
299
|
"""
|
222
300
|
self._check_inputs(transcript, duration, chunk_time)
|
223
301
|
|
224
|
-
body =
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
302
|
+
body = self._generate_request_body(
|
303
|
+
transcript=transcript,
|
304
|
+
voice=voice,
|
305
|
+
model_id=model_id,
|
306
|
+
duration=duration,
|
307
|
+
chunk_time=chunk_time
|
230
308
|
)
|
231
|
-
body.update({k: v for k, v in optional_body.items() if v is not None})
|
232
309
|
|
233
310
|
if websocket:
|
234
311
|
generator = self._generate_ws(body)
|
235
312
|
else:
|
236
|
-
generator = self.
|
313
|
+
generator = self._generate_http_wrapper(body)
|
237
314
|
|
238
315
|
if stream:
|
239
316
|
return generator
|
@@ -247,30 +324,32 @@ class CartesiaTTS:
|
|
247
324
|
|
248
325
|
return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
|
249
326
|
|
327
|
+
@retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
|
328
|
+
def _generate_http_wrapper(self, body: Dict[str, Any]):
|
329
|
+
"""Need to wrap the http generator in a function for the retry decorator to work."""
|
330
|
+
try:
|
331
|
+
for chunk in self._generate_http(body):
|
332
|
+
yield chunk
|
333
|
+
except Exception as e:
|
334
|
+
logger.error(f"Failed to generate audio. {e}")
|
335
|
+
raise e
|
336
|
+
|
250
337
|
def _generate_http(self, body: Dict[str, Any]):
|
251
338
|
response = requests.post(
|
252
339
|
f"{self._http_url()}/audio/stream",
|
253
340
|
stream=True,
|
254
341
|
data=json.dumps(body),
|
255
342
|
headers=self.headers,
|
343
|
+
timeout=(DEFAULT_TIMEOUT, DEFAULT_TIMEOUT),
|
256
344
|
)
|
257
|
-
if response.
|
345
|
+
if not response.ok:
|
258
346
|
raise ValueError(f"Failed to generate audio. {response.text}")
|
259
347
|
|
260
348
|
buffer = ""
|
261
349
|
for chunk_bytes in response.iter_content(chunk_size=None):
|
262
|
-
buffer
|
263
|
-
|
264
|
-
|
265
|
-
end_index = buffer.find("}", start_index)
|
266
|
-
if start_index != -1 and end_index != -1:
|
267
|
-
try:
|
268
|
-
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
269
|
-
audio = base64.b64decode(chunk_json["data"])
|
270
|
-
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
271
|
-
buffer = buffer[end_index + 1 :]
|
272
|
-
except json.JSONDecodeError:
|
273
|
-
break
|
350
|
+
buffer, outputs = update_buffer(buffer, chunk_bytes)
|
351
|
+
for output in outputs:
|
352
|
+
yield output
|
274
353
|
|
275
354
|
if buffer:
|
276
355
|
try:
|
@@ -280,21 +359,77 @@ class CartesiaTTS:
|
|
280
359
|
except json.JSONDecodeError:
|
281
360
|
pass
|
282
361
|
|
283
|
-
def _generate_ws(self, body: Dict[str, Any]):
|
362
|
+
def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
|
363
|
+
"""Generate audio using the websocket connection.
|
364
|
+
|
365
|
+
Args:
|
366
|
+
body: The request body.
|
367
|
+
context_id: The context id for the request.
|
368
|
+
The context id must be globally unique for the duration this client exists.
|
369
|
+
If this is provided, the context id that is in the response will
|
370
|
+
also be returned as part of the dict. This is helpful for testing.
|
371
|
+
"""
|
284
372
|
if not self.websocket or self._is_websocket_closed():
|
285
373
|
self.refresh_websocket()
|
286
374
|
|
287
|
-
|
375
|
+
include_context_id = bool(context_id)
|
376
|
+
if context_id is None:
|
377
|
+
context_id = uuid.uuid4().hex
|
378
|
+
self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
|
288
379
|
try:
|
289
|
-
|
290
|
-
while not response["done"]:
|
291
|
-
audio = base64.b64decode(response["data"])
|
292
|
-
# print("timing", time.perf_counter() - start)
|
293
|
-
yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
|
294
|
-
|
380
|
+
while True:
|
295
381
|
response = json.loads(self.websocket.recv())
|
296
|
-
|
297
|
-
|
382
|
+
if response["done"]:
|
383
|
+
break
|
384
|
+
|
385
|
+
yield convert_response(response, include_context_id)
|
386
|
+
|
387
|
+
if self.experimental_ws_handle_interrupts:
|
388
|
+
self.websocket.send(json.dumps({"context_id": context_id}))
|
389
|
+
except GeneratorExit:
|
390
|
+
# The exit is only called when the generator is garbage collected.
|
391
|
+
# It may not be called directly after a break statement.
|
392
|
+
# However, the generator will be automatically cancelled on the next request.
|
393
|
+
if self.experimental_ws_handle_interrupts:
|
394
|
+
self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
|
395
|
+
except Exception as e:
|
396
|
+
# Close the websocket connection if an error occurs.
|
397
|
+
if self.websocket and not self._is_websocket_closed():
|
398
|
+
self.websocket.close()
|
399
|
+
raise RuntimeError(f"Failed to generate audio. {response}") from e
|
400
|
+
finally:
|
401
|
+
# Ensure the websocket is ultimately closed.
|
402
|
+
if self.websocket and not self._is_websocket_closed():
|
403
|
+
self.websocket.close()
|
404
|
+
|
405
|
+
@retry_on_connection_error(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
|
406
|
+
def transcribe(self, raw_audio: Union[bytes, str]) -> str:
|
407
|
+
raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
|
408
|
+
response = httpx.post(
|
409
|
+
f"{self._http_url()}/audio/transcriptions",
|
410
|
+
headers=headers,
|
411
|
+
files={"clip": ("input.wav", raw_audio_bytes)},
|
412
|
+
timeout=DEFAULT_TIMEOUT,
|
413
|
+
)
|
414
|
+
|
415
|
+
if not response.is_success:
|
416
|
+
raise ValueError(f"Failed to transcribe audio. Error: {response.text()}")
|
417
|
+
|
418
|
+
transcript = response.json()
|
419
|
+
return transcript["text"]
|
420
|
+
|
421
|
+
|
422
|
+
def prepare_audio_and_headers(
|
423
|
+
self, raw_audio: Union[bytes, str]
|
424
|
+
) -> Tuple[bytes, Dict[str, Any]]:
|
425
|
+
if isinstance(raw_audio, str):
|
426
|
+
with open(raw_audio, "rb") as f:
|
427
|
+
raw_audio_bytes = f.read()
|
428
|
+
else:
|
429
|
+
raw_audio_bytes = raw_audio
|
430
|
+
# application/json is not the right content type for this request
|
431
|
+
headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
|
432
|
+
return raw_audio_bytes, headers
|
298
433
|
|
299
434
|
def _http_url(self):
|
300
435
|
prefix = "http" if "localhost" in self.base_url else "https"
|
@@ -304,6 +439,235 @@ class CartesiaTTS:
|
|
304
439
|
prefix = "ws" if "localhost" in self.base_url else "wss"
|
305
440
|
return f"{prefix}://{self.base_url}/{self.api_version}"
|
306
441
|
|
307
|
-
def
|
308
|
-
if self.websocket.
|
442
|
+
def close(self):
|
443
|
+
if self.websocket and not self._is_websocket_closed():
|
309
444
|
self.websocket.close()
|
445
|
+
|
446
|
+
def __del__(self):
|
447
|
+
self.close()
|
448
|
+
|
449
|
+
def __enter__(self):
|
450
|
+
self.refresh_websocket()
|
451
|
+
return self
|
452
|
+
|
453
|
+
def __exit__(
|
454
|
+
self,
|
455
|
+
exc_type: Union[type, None],
|
456
|
+
exc: Union[BaseException, None],
|
457
|
+
exc_tb: Union[TracebackType, None],
|
458
|
+
):
|
459
|
+
self.close()
|
460
|
+
|
461
|
+
|
462
|
+
class AsyncCartesiaTTS(CartesiaTTS):
|
463
|
+
def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
|
464
|
+
self._session = None
|
465
|
+
self._loop = None
|
466
|
+
super().__init__(
|
467
|
+
api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
|
468
|
+
)
|
469
|
+
|
470
|
+
async def _get_session(self):
|
471
|
+
current_loop = asyncio.get_event_loop()
|
472
|
+
if self._loop is not current_loop:
|
473
|
+
# If the loop has changed, close the session and create a new one.
|
474
|
+
await self.close()
|
475
|
+
if self._session is None or self._session.closed:
|
476
|
+
timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
|
477
|
+
connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
|
478
|
+
self._session = aiohttp.ClientSession(
|
479
|
+
timeout=timeout, connector=connector
|
480
|
+
)
|
481
|
+
self._loop = current_loop
|
482
|
+
return self._session
|
483
|
+
|
484
|
+
async def refresh_websocket(self):
|
485
|
+
"""Refresh the websocket connection."""
|
486
|
+
if self.websocket is None or self._is_websocket_closed():
|
487
|
+
route = "audio/websocket"
|
488
|
+
if self.experimental_ws_handle_interrupts:
|
489
|
+
route = f"experimental/{route}"
|
490
|
+
session = await self._get_session()
|
491
|
+
self.websocket = await session.ws_connect(
|
492
|
+
f"{self._ws_url()}/{route}?api_key={self.api_key}"
|
493
|
+
)
|
494
|
+
|
495
|
+
def _is_websocket_closed(self):
|
496
|
+
return self.websocket.closed
|
497
|
+
|
498
|
+
async def close(self):
|
499
|
+
"""This method closes the websocket and the session.
|
500
|
+
|
501
|
+
It is *strongly* recommended to call this method when you are done using the client.
|
502
|
+
"""
|
503
|
+
if self.websocket is not None and not self._is_websocket_closed():
|
504
|
+
await self.websocket.close()
|
505
|
+
if self._session is not None and not self._session.closed:
|
506
|
+
await self._session.close()
|
507
|
+
|
508
|
+
async def generate(
|
509
|
+
self,
|
510
|
+
*,
|
511
|
+
transcript: str,
|
512
|
+
voice: Embedding,
|
513
|
+
model_id: str = DEFAULT_MODEL_ID,
|
514
|
+
duration: int = None,
|
515
|
+
chunk_time: float = None,
|
516
|
+
stream: bool = False,
|
517
|
+
websocket: bool = True,
|
518
|
+
) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
|
519
|
+
"""Asynchronously generate audio from a transcript.
|
520
|
+
NOTE: This overrides the non-asynchronous generate method from the base class.
|
521
|
+
|
522
|
+
Args:
|
523
|
+
transcript (str): The text to generate audio for.
|
524
|
+
voice (Embedding (List[float])): The voice to use for generating audio.
|
525
|
+
duration (int, optional): The maximum duration of the audio in seconds.
|
526
|
+
chunk_time (float, optional): How long each audio segment should be in seconds.
|
527
|
+
This should not need to be adjusted.
|
528
|
+
stream (bool, optional): Whether to stream the audio or not.
|
529
|
+
If True this function returns a generator. False by default.
|
530
|
+
websocket (bool, optional): Whether to use a websocket for streaming audio.
|
531
|
+
Using the websocket reduces latency by pre-poning the handshake. True by default.
|
532
|
+
|
533
|
+
Returns:
|
534
|
+
A generator if `stream` is True, otherwise a dictionary.
|
535
|
+
Dictionary from both generator and non-generator return types have the following keys:
|
536
|
+
* "audio": The audio as a bytes buffer.
|
537
|
+
* "sampling_rate": The sampling rate of the audio.
|
538
|
+
"""
|
539
|
+
self._check_inputs(transcript, duration, chunk_time)
|
540
|
+
|
541
|
+
body = self._generate_request_body(
|
542
|
+
transcript=transcript,
|
543
|
+
voice=voice,
|
544
|
+
model_id=model_id,
|
545
|
+
duration=duration,
|
546
|
+
chunk_time=chunk_time
|
547
|
+
)
|
548
|
+
|
549
|
+
if websocket:
|
550
|
+
generator = self._generate_ws(body)
|
551
|
+
else:
|
552
|
+
generator = self._generate_http_wrapper(body)
|
553
|
+
|
554
|
+
if stream:
|
555
|
+
return generator
|
556
|
+
|
557
|
+
chunks = []
|
558
|
+
sampling_rate = None
|
559
|
+
async for chunk in generator:
|
560
|
+
if sampling_rate is None:
|
561
|
+
sampling_rate = chunk["sampling_rate"]
|
562
|
+
chunks.append(chunk["audio"])
|
563
|
+
|
564
|
+
return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
|
565
|
+
|
566
|
+
@retry_on_connection_error_async(max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger)
|
567
|
+
async def _generate_http_wrapper(self, body: Dict[str, Any]):
|
568
|
+
"""Need to wrap the http generator in a function for the retry decorator to work."""
|
569
|
+
try:
|
570
|
+
async for chunk in self._generate_http(body):
|
571
|
+
yield chunk
|
572
|
+
except Exception as e:
|
573
|
+
logger.error(f"Failed to generate audio. {e}")
|
574
|
+
raise e
|
575
|
+
|
576
|
+
async def _generate_http(self, body: Dict[str, Any]):
|
577
|
+
session = await self._get_session()
|
578
|
+
async with session.post(
|
579
|
+
f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
|
580
|
+
) as response:
|
581
|
+
if not response.ok:
|
582
|
+
raise ValueError(f"Failed to generate audio. {await response.text()}")
|
583
|
+
|
584
|
+
buffer = ""
|
585
|
+
async for chunk_bytes in response.content.iter_any():
|
586
|
+
buffer, outputs = update_buffer(buffer, chunk_bytes)
|
587
|
+
for output in outputs:
|
588
|
+
yield output
|
589
|
+
|
590
|
+
if buffer:
|
591
|
+
try:
|
592
|
+
chunk_json = json.loads(buffer)
|
593
|
+
audio = base64.b64decode(chunk_json["data"])
|
594
|
+
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
595
|
+
except json.JSONDecodeError:
|
596
|
+
pass
|
597
|
+
|
598
|
+
async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
|
599
|
+
include_context_id = bool(context_id)
|
600
|
+
route = "audio/websocket"
|
601
|
+
if self.experimental_ws_handle_interrupts:
|
602
|
+
route = f"experimental/{route}"
|
603
|
+
|
604
|
+
if not self.websocket or self._is_websocket_closed():
|
605
|
+
await self.refresh_websocket()
|
606
|
+
|
607
|
+
ws = self.websocket
|
608
|
+
if context_id is None:
|
609
|
+
context_id = uuid.uuid4().hex
|
610
|
+
await ws.send_json({"data": body, "context_id": context_id})
|
611
|
+
try:
|
612
|
+
response = None
|
613
|
+
while True:
|
614
|
+
response = await ws.receive_json()
|
615
|
+
if response["done"]:
|
616
|
+
break
|
617
|
+
|
618
|
+
yield convert_response(response, include_context_id)
|
619
|
+
|
620
|
+
if self.experimental_ws_handle_interrupts:
|
621
|
+
await ws.send_json({"context_id": context_id})
|
622
|
+
except GeneratorExit:
|
623
|
+
# The exit is only called when the generator is garbage collected.
|
624
|
+
# It may not be called directly after a break statement.
|
625
|
+
# However, the generator will be automatically cancelled on the next request.
|
626
|
+
if self.experimental_ws_handle_interrupts:
|
627
|
+
await ws.send_json({"context_id": context_id, "action": "cancel"})
|
628
|
+
except Exception as e:
|
629
|
+
if self.websocket and not self._is_websocket_closed():
|
630
|
+
await self.websocket.close()
|
631
|
+
raise RuntimeError(f"Failed to generate audio. {await response.text()}") from e
|
632
|
+
finally:
|
633
|
+
# Ensure the websocket is ultimately closed.
|
634
|
+
if self.websocket and not self._is_websocket_closed():
|
635
|
+
await self.websocket.close()
|
636
|
+
|
637
|
+
async def transcribe(self, raw_audio: Union[bytes, str]) -> str:
|
638
|
+
raw_audio_bytes, headers = self.prepare_audio_and_headers(raw_audio)
|
639
|
+
data = aiohttp.FormData()
|
640
|
+
data.add_field("clip", raw_audio_bytes, filename="input.wav", content_type="audio/wav")
|
641
|
+
session = await self._get_session()
|
642
|
+
|
643
|
+
async with session.post(
|
644
|
+
f"{self._http_url()}/audio/transcriptions", headers=headers, data=data
|
645
|
+
) as response:
|
646
|
+
if not response.ok:
|
647
|
+
raise ValueError(f"Failed to transcribe audio. Error: {await response.text()}")
|
648
|
+
|
649
|
+
transcript = await response.json()
|
650
|
+
return transcript["text"]
|
651
|
+
|
652
|
+
def __del__(self):
|
653
|
+
try:
|
654
|
+
loop = asyncio.get_running_loop()
|
655
|
+
except RuntimeError:
|
656
|
+
loop = None
|
657
|
+
|
658
|
+
if loop is None:
|
659
|
+
asyncio.run(self.close())
|
660
|
+
else:
|
661
|
+
loop.create_task(self.close())
|
662
|
+
|
663
|
+
async def __aenter__(self):
|
664
|
+
await self.refresh_websocket()
|
665
|
+
return self
|
666
|
+
|
667
|
+
async def __aexit__(
|
668
|
+
self,
|
669
|
+
exc_type: Union[type, None],
|
670
|
+
exc: Union[BaseException, None],
|
671
|
+
exc_tb: Union[TracebackType, None],
|
672
|
+
):
|
673
|
+
await self.close()
|
cartesia/utils.py
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
import time
|
2
|
+
|
3
|
+
from aiohttp.client_exceptions import ServerDisconnectedError
|
4
|
+
import asyncio
|
5
|
+
from functools import wraps
|
6
|
+
from http.client import RemoteDisconnected
|
7
|
+
from httpx import TimeoutException
|
8
|
+
from requests.exceptions import ConnectionError
|
9
|
+
|
10
|
+
def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
|
11
|
+
"""Retry a function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
|
12
|
+
|
13
|
+
Args:
|
14
|
+
max_retries (int): The maximum number of retries.
|
15
|
+
backoff_factor (int): The factor to increase the delay between retries.
|
16
|
+
logger (logging.Logger): The logger to use for logging.
|
17
|
+
"""
|
18
|
+
def decorator(func):
|
19
|
+
@wraps(func)
|
20
|
+
def wrapper(*args, **kwargs):
|
21
|
+
retry_count = 0
|
22
|
+
while retry_count < max_retries:
|
23
|
+
try:
|
24
|
+
return func(*args, **kwargs)
|
25
|
+
except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
|
26
|
+
logger.info(f"Retrying after exception: {e}")
|
27
|
+
retry_count += 1
|
28
|
+
if retry_count < max_retries:
|
29
|
+
delay = backoff_factor * (2 ** (retry_count - 1))
|
30
|
+
logger.warn(f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds...")
|
31
|
+
time.sleep(delay)
|
32
|
+
else:
|
33
|
+
raise Exception(f"Exception occurred after {max_retries} tries.") from e
|
34
|
+
return wrapper
|
35
|
+
return decorator
|
36
|
+
|
37
|
+
def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None):
|
38
|
+
"""Retry an asynchronous function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
|
39
|
+
|
40
|
+
Args:
|
41
|
+
max_retries (int): The maximum number of retries.
|
42
|
+
backoff_factor (int): The factor to increase the delay between retries.
|
43
|
+
logger (logging.Logger): The logger to use for logging.
|
44
|
+
"""
|
45
|
+
def decorator(func):
|
46
|
+
@wraps(func)
|
47
|
+
async def wrapper(*args, **kwargs):
|
48
|
+
retry_count = 0
|
49
|
+
while retry_count < max_retries:
|
50
|
+
try:
|
51
|
+
async for chunk in func(*args, **kwargs):
|
52
|
+
yield chunk
|
53
|
+
# If the function completes without raising an exception return
|
54
|
+
return
|
55
|
+
except (ConnectionError, RemoteDisconnected, ServerDisconnectedError, TimeoutException) as e:
|
56
|
+
logger.info(f"Retrying after exception: {e}")
|
57
|
+
retry_count += 1
|
58
|
+
if retry_count < max_retries:
|
59
|
+
delay = backoff_factor * (2 ** (retry_count - 1))
|
60
|
+
logger.warn(f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds...")
|
61
|
+
await asyncio.sleep(delay)
|
62
|
+
else:
|
63
|
+
raise Exception(f"Exception occurred after {max_retries} tries.") from e
|
64
|
+
return wrapper
|
65
|
+
return decorator
|
cartesia/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.
|
1
|
+
__version__ = "0.0.5"
|
@@ -0,0 +1,187 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: cartesia
|
3
|
+
Version: 0.0.5
|
4
|
+
Summary: The official Python library for the Cartesia API.
|
5
|
+
Home-page:
|
6
|
+
Author: Cartesia, Inc.
|
7
|
+
Author-email: support@cartesia.ai
|
8
|
+
Classifier: Programming Language :: Python
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
11
|
+
Requires-Python: >=3.8.0
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
Requires-Dist: aiohttp
|
14
|
+
Requires-Dist: httpx
|
15
|
+
Requires-Dist: pytest-asyncio
|
16
|
+
Requires-Dist: requests
|
17
|
+
Requires-Dist: websockets
|
18
|
+
Provides-Extra: all
|
19
|
+
Requires-Dist: pytest >=8.0.2 ; extra == 'all'
|
20
|
+
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
|
21
|
+
Requires-Dist: twine ; extra == 'all'
|
22
|
+
Requires-Dist: setuptools ; extra == 'all'
|
23
|
+
Requires-Dist: wheel ; extra == 'all'
|
24
|
+
Provides-Extra: dev
|
25
|
+
Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
|
26
|
+
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
|
27
|
+
Requires-Dist: twine ; extra == 'dev'
|
28
|
+
Requires-Dist: setuptools ; extra == 'dev'
|
29
|
+
Requires-Dist: wheel ; extra == 'dev'
|
30
|
+
|
31
|
+
|
32
|
+
# Cartesia Python API Library
|
33
|
+
The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
|
34
|
+
|
35
|
+
**Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
|
36
|
+
|
37
|
+
## Installation
|
38
|
+
```bash
|
39
|
+
pip install cartesia
|
40
|
+
|
41
|
+
# pip install in editable mode w/ dev dependencies
|
42
|
+
pip install -e '.[dev]'
|
43
|
+
```
|
44
|
+
|
45
|
+
## Usage
|
46
|
+
```python
|
47
|
+
from cartesia.tts import CartesiaTTS
|
48
|
+
import pyaudio
|
49
|
+
import os
|
50
|
+
|
51
|
+
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
52
|
+
voices = client.get_voices()
|
53
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
54
|
+
transcript = "Hello! Welcome to Cartesia"
|
55
|
+
model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
|
56
|
+
|
57
|
+
p = pyaudio.PyAudio()
|
58
|
+
|
59
|
+
stream = None
|
60
|
+
|
61
|
+
# Generate and stream audio
|
62
|
+
for output in client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
|
63
|
+
buffer = output["audio"]
|
64
|
+
rate = output["sampling_rate"]
|
65
|
+
|
66
|
+
if not stream:
|
67
|
+
stream = p.open(format=pyaudio.paFloat32,
|
68
|
+
channels=1,
|
69
|
+
rate=rate,
|
70
|
+
output=True)
|
71
|
+
|
72
|
+
# Write the audio data to the stream
|
73
|
+
stream.write(buffer)
|
74
|
+
|
75
|
+
stream.stop_stream()
|
76
|
+
stream.close()
|
77
|
+
p.terminate()
|
78
|
+
```
|
79
|
+
|
80
|
+
You can also use the async client if you want to make asynchronous API calls:
|
81
|
+
```python
|
82
|
+
from cartesia.tts import AsyncCartesiaTTS
|
83
|
+
import asyncio
|
84
|
+
import pyaudio
|
85
|
+
import os
|
86
|
+
|
87
|
+
async def write_stream():
|
88
|
+
client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
89
|
+
voices = client.get_voices()
|
90
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
91
|
+
transcript = "Hello! Welcome to Cartesia"
|
92
|
+
model_id = "genial-planet-1346" # (Optional) We'll specify a default if you don't have a specific model in mind
|
93
|
+
|
94
|
+
p = pyaudio.PyAudio()
|
95
|
+
|
96
|
+
stream = None
|
97
|
+
|
98
|
+
# Generate and stream audio
|
99
|
+
async for output in await client.generate(transcript=transcript, voice=voice, model_id=model_id, stream=True):
|
100
|
+
buffer = output["audio"]
|
101
|
+
rate = output["sampling_rate"]
|
102
|
+
|
103
|
+
if not stream:
|
104
|
+
stream = p.open(format=pyaudio.paFloat32,
|
105
|
+
channels=1,
|
106
|
+
rate=rate,
|
107
|
+
output=True)
|
108
|
+
|
109
|
+
# Write the audio data to the stream
|
110
|
+
stream.write(buffer)
|
111
|
+
|
112
|
+
stream.stop_stream()
|
113
|
+
stream.close()
|
114
|
+
p.terminate()
|
115
|
+
|
116
|
+
asyncio.run(write_stream())
|
117
|
+
```
|
118
|
+
|
119
|
+
If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
|
120
|
+
Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
|
121
|
+
|
122
|
+
```python
|
123
|
+
from IPython.display import Audio
|
124
|
+
import io
|
125
|
+
import os
|
126
|
+
import numpy as np
|
127
|
+
|
128
|
+
from cartesia.tts import CartesiaTTS
|
129
|
+
|
130
|
+
with CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
|
131
|
+
voices = client.get_voices()
|
132
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
133
|
+
transcript = "Hello! Welcome to Cartesia"
|
134
|
+
|
135
|
+
# Create a BytesIO object to store the audio data
|
136
|
+
audio_data = io.BytesIO()
|
137
|
+
|
138
|
+
# Generate and stream audio
|
139
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
140
|
+
buffer = output["audio"]
|
141
|
+
audio_data.write(buffer)
|
142
|
+
|
143
|
+
# Set the cursor position to the beginning of the BytesIO object
|
144
|
+
audio_data.seek(0)
|
145
|
+
|
146
|
+
# Create an Audio object from the BytesIO data
|
147
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
148
|
+
|
149
|
+
# Display the Audio object
|
150
|
+
display(audio)
|
151
|
+
```
|
152
|
+
|
153
|
+
Below is the same example using the async client:
|
154
|
+
```python
|
155
|
+
from IPython.display import Audio
|
156
|
+
import io
|
157
|
+
import os
|
158
|
+
import numpy as np
|
159
|
+
|
160
|
+
from cartesia.tts import AsyncCartesiaTTS
|
161
|
+
|
162
|
+
async with AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
|
163
|
+
voices = client.get_voices()
|
164
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
165
|
+
transcript = "Hello! Welcome to Cartesia"
|
166
|
+
|
167
|
+
# Create a BytesIO object to store the audio data
|
168
|
+
audio_data = io.BytesIO()
|
169
|
+
|
170
|
+
# Generate and stream audio
|
171
|
+
async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
|
172
|
+
buffer = output["audio"]
|
173
|
+
audio_data.write(buffer)
|
174
|
+
|
175
|
+
# Set the cursor position to the beginning of the BytesIO object
|
176
|
+
audio_data.seek(0)
|
177
|
+
|
178
|
+
# Create an Audio object from the BytesIO data
|
179
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
180
|
+
|
181
|
+
# Display the Audio object
|
182
|
+
display(audio)
|
183
|
+
```
|
184
|
+
|
185
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
186
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
187
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
|
@@ -0,0 +1,8 @@
|
|
1
|
+
cartesia/__init__.py,sha256=uIc9xGNPs8_A6eAvbTUY1geazunYoEZVWFKhCwC9TRA,102
|
2
|
+
cartesia/tts.py,sha256=Gtm9qse83g3SX4-KbmlxOAvTQcZIjmUkMfBKu2Xf9rY,26449
|
3
|
+
cartesia/utils.py,sha256=GoTJe8LZ3WpS4hXkwoZauPYjo7Mbx7BvbBjAX5vEbwg,3024
|
4
|
+
cartesia/version.py,sha256=S7u1lbuWmM3A3ajykBialmPoJUK6Jg-WmNqM-9OZFdk,22
|
5
|
+
cartesia-0.0.5.dist-info/METADATA,sha256=oK64bcTyLhrosXh9FjuEwB2SUdQzbYsxzOWCnf6qaI4,5974
|
6
|
+
cartesia-0.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
|
7
|
+
cartesia-0.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
8
|
+
cartesia-0.0.5.dist-info/RECORD,,
|
@@ -1,113 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: cartesia
|
3
|
-
Version: 0.0.3
|
4
|
-
Summary: The official Python library for the Cartesia API.
|
5
|
-
Home-page:
|
6
|
-
Author: Cartesia, Inc.
|
7
|
-
Author-email: support@cartesia.ai
|
8
|
-
Classifier: Programming Language :: Python
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
10
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
11
|
-
Requires-Python: >=3.8.0
|
12
|
-
Description-Content-Type: text/markdown
|
13
|
-
Requires-Dist: websockets
|
14
|
-
Requires-Dist: requests
|
15
|
-
Provides-Extra: all
|
16
|
-
Requires-Dist: pre-commit ; extra == 'all'
|
17
|
-
Requires-Dist: docformatter ; extra == 'all'
|
18
|
-
Requires-Dist: black ==24.1.1 ; extra == 'all'
|
19
|
-
Requires-Dist: isort ==5.13.2 ; extra == 'all'
|
20
|
-
Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
|
21
|
-
Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
|
22
|
-
Requires-Dist: pytest >=8.0.2 ; extra == 'all'
|
23
|
-
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
|
24
|
-
Provides-Extra: dev
|
25
|
-
Requires-Dist: pre-commit ; extra == 'dev'
|
26
|
-
Requires-Dist: docformatter ; extra == 'dev'
|
27
|
-
Requires-Dist: black ==24.1.1 ; extra == 'dev'
|
28
|
-
Requires-Dist: isort ==5.13.2 ; extra == 'dev'
|
29
|
-
Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
|
30
|
-
Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
|
31
|
-
Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
|
32
|
-
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
|
33
|
-
|
34
|
-
|
35
|
-
# Cartesia Python API Library
|
36
|
-
The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
|
37
|
-
|
38
|
-
**Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
|
39
|
-
|
40
|
-
## Installation
|
41
|
-
```bash
|
42
|
-
pip install cartesia
|
43
|
-
|
44
|
-
# pip install in editable mode w/ dev dependencies
|
45
|
-
pip install -e '.[dev]'
|
46
|
-
```
|
47
|
-
|
48
|
-
## Usage
|
49
|
-
```python
|
50
|
-
from cartesia.tts import CartesiaTTS
|
51
|
-
import pyaudio
|
52
|
-
import os
|
53
|
-
|
54
|
-
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
55
|
-
voices = client.get_voices()
|
56
|
-
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
57
|
-
transcript = "Hello! Welcome to Cartesia"
|
58
|
-
|
59
|
-
p = pyaudio.PyAudio()
|
60
|
-
|
61
|
-
stream = None
|
62
|
-
|
63
|
-
# Generate and stream audio
|
64
|
-
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
65
|
-
buffer = output["audio"]
|
66
|
-
rate = output["sampling_rate"]
|
67
|
-
|
68
|
-
if not stream:
|
69
|
-
stream = p.open(format=pyaudio.paFloat32,
|
70
|
-
channels=1,
|
71
|
-
rate=rate,
|
72
|
-
output=True)
|
73
|
-
|
74
|
-
# Write the audio data to the stream
|
75
|
-
stream.write(buffer)
|
76
|
-
|
77
|
-
stream.stop_stream()
|
78
|
-
stream.close()
|
79
|
-
p.terminate()
|
80
|
-
```
|
81
|
-
|
82
|
-
If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
|
83
|
-
|
84
|
-
```python
|
85
|
-
from cartesia.tts import CartesiaTTS
|
86
|
-
from IPython.display import Audio
|
87
|
-
import io
|
88
|
-
import os
|
89
|
-
|
90
|
-
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
91
|
-
voices = client.get_voices()
|
92
|
-
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
93
|
-
transcript = "Hello! Welcome to Cartesia"
|
94
|
-
|
95
|
-
# Create a BytesIO object to store the audio data
|
96
|
-
audio_data = io.BytesIO()
|
97
|
-
|
98
|
-
# Generate and stream audio
|
99
|
-
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
100
|
-
buffer = output["audio"]
|
101
|
-
audio_data.write(buffer)
|
102
|
-
|
103
|
-
# Set the cursor position to the beginning of the BytesIO object
|
104
|
-
audio_data.seek(0)
|
105
|
-
|
106
|
-
# Create an Audio object from the BytesIO data
|
107
|
-
audio = Audio(audio_data, rate=output["sampling_rate"])
|
108
|
-
|
109
|
-
# Display the Audio object
|
110
|
-
display(audio)
|
111
|
-
```
|
112
|
-
|
113
|
-
We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
|
cartesia-0.0.3.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
|
2
|
-
cartesia/tts.py,sha256=ABXW9rc8Pn0GTRvb_7DHZKMtbvhGUiqOgHmvztwlOnI,12033
|
3
|
-
cartesia/version.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
|
4
|
-
cartesia-0.0.3.dist-info/METADATA,sha256=VsCGL1sITbKqERihK2rzVm9WIY5EJ5nCS_CXQ0s14ns,3604
|
5
|
-
cartesia-0.0.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
6
|
-
cartesia-0.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
7
|
-
cartesia-0.0.3.dist-info/RECORD,,
|
File without changes
|