cartesia 0.0.3__tar.gz → 0.0.5rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/PKG-INFO +41 -4
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/README.md +34 -2
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia/tts.py +256 -40
- cartesia-0.0.5rc1/cartesia/version.py +1 -0
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia.egg-info/PKG-INFO +41 -4
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia.egg-info/requires.txt +6 -1
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/setup.py +46 -18
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/tests/test_tts.py +95 -3
- cartesia-0.0.3/cartesia/version.py +0 -1
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia/__init__.py +0 -0
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia.egg-info/SOURCES.txt +0 -0
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/pyproject.toml +0 -0
- {cartesia-0.0.3 → cartesia-0.0.5rc1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.5rc1
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -10,8 +10,11 @@ Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
11
11
|
Requires-Python: >=3.8.0
|
12
12
|
Description-Content-Type: text/markdown
|
13
|
-
Requires-Dist:
|
13
|
+
Requires-Dist: aiohttp
|
14
|
+
Requires-Dist: httpx
|
15
|
+
Requires-Dist: pytest-asyncio
|
14
16
|
Requires-Dist: requests
|
17
|
+
Requires-Dist: websockets
|
15
18
|
Provides-Extra: dev
|
16
19
|
Requires-Dist: pre-commit; extra == "dev"
|
17
20
|
Requires-Dist: docformatter; extra == "dev"
|
@@ -21,6 +24,7 @@ Requires-Dist: flake8==7.0.0; extra == "dev"
|
|
21
24
|
Requires-Dist: flake8-bugbear==24.2.6; extra == "dev"
|
22
25
|
Requires-Dist: pytest>=8.0.2; extra == "dev"
|
23
26
|
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
27
|
+
Requires-Dist: twine; extra == "dev"
|
24
28
|
Provides-Extra: all
|
25
29
|
Requires-Dist: pre-commit; extra == "all"
|
26
30
|
Requires-Dist: docformatter; extra == "all"
|
@@ -30,6 +34,7 @@ Requires-Dist: flake8==7.0.0; extra == "all"
|
|
30
34
|
Requires-Dist: flake8-bugbear==24.2.6; extra == "all"
|
31
35
|
Requires-Dist: pytest>=8.0.2; extra == "all"
|
32
36
|
Requires-Dist: pytest-cov>=4.1.0; extra == "all"
|
37
|
+
Requires-Dist: twine; extra == "all"
|
33
38
|
|
34
39
|
|
35
40
|
# Cartesia Python API Library
|
@@ -104,10 +109,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
|
104
109
|
audio_data.seek(0)
|
105
110
|
|
106
111
|
# Create an Audio object from the BytesIO data
|
107
|
-
audio = Audio(audio_data, rate=output["sampling_rate"])
|
112
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
113
|
+
|
114
|
+
# Display the Audio object
|
115
|
+
display(audio)
|
116
|
+
```
|
117
|
+
|
118
|
+
You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
|
119
|
+
```python
|
120
|
+
from cartesia.tts import AsyncCartesiaTTS
|
121
|
+
from IPython.display import Audio
|
122
|
+
import io
|
123
|
+
import os
|
124
|
+
|
125
|
+
client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
126
|
+
voices = client.get_voices()
|
127
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
128
|
+
transcript = "Hello! Welcome to Cartesia"
|
129
|
+
|
130
|
+
# Create a BytesIO object to store the audio data
|
131
|
+
audio_data = io.BytesIO()
|
132
|
+
|
133
|
+
# Generate and stream audio
|
134
|
+
async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
|
135
|
+
buffer = output["audio"]
|
136
|
+
audio_data.write(buffer)
|
137
|
+
|
138
|
+
# Set the cursor position to the beginning of the BytesIO object
|
139
|
+
audio_data.seek(0)
|
140
|
+
|
141
|
+
# Create an Audio object from the BytesIO data
|
142
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
108
143
|
|
109
144
|
# Display the Audio object
|
110
145
|
display(audio)
|
111
146
|
```
|
112
147
|
|
113
|
-
|
148
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
149
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
150
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
|
@@ -70,10 +70,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
|
70
70
|
audio_data.seek(0)
|
71
71
|
|
72
72
|
# Create an Audio object from the BytesIO data
|
73
|
-
audio = Audio(audio_data, rate=output["sampling_rate"])
|
73
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
74
74
|
|
75
75
|
# Display the Audio object
|
76
76
|
display(audio)
|
77
77
|
```
|
78
78
|
|
79
|
-
|
79
|
+
You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
|
80
|
+
```python
|
81
|
+
from cartesia.tts import AsyncCartesiaTTS
|
82
|
+
from IPython.display import Audio
|
83
|
+
import io
|
84
|
+
import os
|
85
|
+
|
86
|
+
client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
87
|
+
voices = client.get_voices()
|
88
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
89
|
+
transcript = "Hello! Welcome to Cartesia"
|
90
|
+
|
91
|
+
# Create a BytesIO object to store the audio data
|
92
|
+
audio_data = io.BytesIO()
|
93
|
+
|
94
|
+
# Generate and stream audio
|
95
|
+
async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
|
96
|
+
buffer = output["audio"]
|
97
|
+
audio_data.write(buffer)
|
98
|
+
|
99
|
+
# Set the cursor position to the beginning of the BytesIO object
|
100
|
+
audio_data.seek(0)
|
101
|
+
|
102
|
+
# Create an Audio object from the BytesIO data
|
103
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
104
|
+
|
105
|
+
# Display the Audio object
|
106
|
+
display(audio)
|
107
|
+
```
|
108
|
+
|
109
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
110
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
111
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
|
@@ -1,15 +1,20 @@
|
|
1
|
+
import asyncio
|
1
2
|
import base64
|
2
3
|
import json
|
3
4
|
import os
|
4
5
|
import uuid
|
5
|
-
from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
|
6
|
+
from typing import Any, AsyncGenerator, Dict, Generator, List, Optional, Tuple, TypedDict, Union
|
6
7
|
|
8
|
+
import aiohttp
|
9
|
+
import httpx
|
7
10
|
import requests
|
8
11
|
from websockets.sync.client import connect
|
9
12
|
|
10
13
|
DEFAULT_MODEL_ID = "genial-planet-1346"
|
11
14
|
DEFAULT_BASE_URL = "api.cartesia.ai"
|
12
15
|
DEFAULT_API_VERSION = "v0"
|
16
|
+
DEFAULT_TIMEOUT = 60 # seconds
|
17
|
+
DEFAULT_NUM_CONNECTIONS = 10 # connections per client
|
13
18
|
|
14
19
|
|
15
20
|
class AudioOutput(TypedDict):
|
@@ -27,11 +32,46 @@ class VoiceMetadata(TypedDict):
|
|
27
32
|
embedding: Optional[Embedding]
|
28
33
|
|
29
34
|
|
35
|
+
def update_buffer(buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
|
36
|
+
buffer += chunk_bytes.decode("utf-8")
|
37
|
+
outputs = []
|
38
|
+
while "{" in buffer and "}" in buffer:
|
39
|
+
start_index = buffer.find("{")
|
40
|
+
end_index = buffer.find("}", start_index)
|
41
|
+
if start_index != -1 and end_index != -1:
|
42
|
+
try:
|
43
|
+
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
44
|
+
audio = base64.b64decode(chunk_json["data"])
|
45
|
+
outputs.append({"audio": audio, "sampling_rate": chunk_json["sampling_rate"]})
|
46
|
+
buffer = buffer[end_index + 1 :]
|
47
|
+
except json.JSONDecodeError:
|
48
|
+
break
|
49
|
+
return buffer, outputs
|
50
|
+
|
51
|
+
|
52
|
+
def convert_response(response: Dict[str, any], include_context_id: bool) -> Dict[str, Any]:
|
53
|
+
audio = base64.b64decode(response["data"])
|
54
|
+
|
55
|
+
optional_kwargs = {}
|
56
|
+
if include_context_id:
|
57
|
+
optional_kwargs["context_id"] = response["context_id"]
|
58
|
+
|
59
|
+
return {
|
60
|
+
"audio": audio,
|
61
|
+
"sampling_rate": response["sampling_rate"],
|
62
|
+
**optional_kwargs,
|
63
|
+
}
|
64
|
+
|
65
|
+
|
30
66
|
class CartesiaTTS:
|
31
67
|
"""The client for Cartesia's text-to-speech library.
|
32
68
|
|
33
69
|
This client contains methods to interact with the Cartesia text-to-speech API.
|
34
|
-
The
|
70
|
+
The client can be used to retrieve available voices, compute new voice embeddings,
|
71
|
+
and generate speech from text.
|
72
|
+
|
73
|
+
The client also supports generating audio using a websocket for lower latency.
|
74
|
+
To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
|
35
75
|
|
36
76
|
Examples:
|
37
77
|
|
@@ -55,18 +95,22 @@ class CartesiaTTS:
|
|
55
95
|
... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
|
56
96
|
"""
|
57
97
|
|
58
|
-
def __init__(self, *, api_key: str = None):
|
98
|
+
def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
|
59
99
|
"""
|
60
100
|
Args:
|
61
101
|
api_key: The API key to use for authorization.
|
62
102
|
If not specified, the API key will be read from the environment variable
|
63
103
|
`CARTESIA_API_KEY`.
|
104
|
+
experimental_ws_handle_interrupts: Whether to handle interrupts when generating
|
105
|
+
audio using the websocket. This is an experimental feature and may have bugs
|
106
|
+
or be deprecated in the future.
|
64
107
|
"""
|
65
108
|
self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
|
66
109
|
self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
|
67
110
|
self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
|
68
111
|
self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
|
69
112
|
self.websocket = None
|
113
|
+
self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
|
70
114
|
self.refresh_websocket()
|
71
115
|
|
72
116
|
def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
|
@@ -100,9 +144,9 @@ class CartesiaTTS:
|
|
100
144
|
>>> audio = client.generate(transcript="Hello world!", voice=embedding)
|
101
145
|
"""
|
102
146
|
params = {"select": "id, name, description"} if skip_embeddings else None
|
103
|
-
response =
|
147
|
+
response = httpx.get(f"{self._http_url()}/voices", headers=self.headers, params=params)
|
104
148
|
|
105
|
-
if response.
|
149
|
+
if not response.is_success:
|
106
150
|
raise ValueError(f"Failed to get voices. Error: {response.text}")
|
107
151
|
|
108
152
|
voices = response.json()
|
@@ -134,20 +178,20 @@ class CartesiaTTS:
|
|
134
178
|
|
135
179
|
if voice_id:
|
136
180
|
url = f"{self._http_url()}/voices/embedding/{voice_id}"
|
137
|
-
response =
|
181
|
+
response = httpx.get(url, headers=self.headers)
|
138
182
|
elif filepath:
|
139
183
|
url = f"{self._http_url()}/voices/clone/clip"
|
140
184
|
files = {"clip": open(filepath, "rb")}
|
141
185
|
headers = self.headers.copy()
|
142
186
|
# The default content type of JSON is incorrect for file uploads
|
143
187
|
headers.pop("Content-Type")
|
144
|
-
response =
|
188
|
+
response = httpx.post(url, headers=headers, files=files)
|
145
189
|
elif link:
|
146
190
|
url = f"{self._http_url()}/voices/clone/url"
|
147
191
|
params = {"link": link}
|
148
|
-
response =
|
192
|
+
response = httpx.post(url, headers=self.headers, params=params)
|
149
193
|
|
150
|
-
if response.
|
194
|
+
if not response.is_success:
|
151
195
|
raise ValueError(
|
152
196
|
f"Failed to clone voice. Status Code: {response.status_code}\n"
|
153
197
|
f"Error: {response.text}"
|
@@ -167,8 +211,11 @@ class CartesiaTTS:
|
|
167
211
|
"""
|
168
212
|
if self.websocket and not self._is_websocket_closed():
|
169
213
|
self.websocket.close()
|
214
|
+
route = "audio/websocket"
|
215
|
+
if self.experimental_ws_handle_interrupts:
|
216
|
+
route = f"experimental/{route}"
|
170
217
|
self.websocket = connect(
|
171
|
-
f"{self._ws_url()}/
|
218
|
+
f"{self._ws_url()}/{route}?api_key={self.api_key}",
|
172
219
|
close_timeout=None,
|
173
220
|
)
|
174
221
|
|
@@ -189,6 +236,29 @@ class CartesiaTTS:
|
|
189
236
|
if transcript.strip() == "":
|
190
237
|
raise ValueError("`transcript` must be non empty")
|
191
238
|
|
239
|
+
def _generate_request_body(
|
240
|
+
self,
|
241
|
+
*,
|
242
|
+
transcript: str,
|
243
|
+
duration: int = None,
|
244
|
+
chunk_time: float = None,
|
245
|
+
voice: Embedding = None,
|
246
|
+
) -> Dict[str, Any]:
|
247
|
+
"""
|
248
|
+
Create the request body for a stream request.
|
249
|
+
Note that anything that's not provided will use a default if available or be filtered out otherwise.
|
250
|
+
"""
|
251
|
+
body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=voice)
|
252
|
+
|
253
|
+
optional_body = dict(
|
254
|
+
duration=duration,
|
255
|
+
chunk_time=chunk_time,
|
256
|
+
voice=voice,
|
257
|
+
)
|
258
|
+
body.update({k: v for k, v in optional_body.items() if v is not None})
|
259
|
+
|
260
|
+
return body
|
261
|
+
|
192
262
|
def generate(
|
193
263
|
self,
|
194
264
|
*,
|
@@ -221,14 +291,9 @@ class CartesiaTTS:
|
|
221
291
|
"""
|
222
292
|
self._check_inputs(transcript, duration, chunk_time)
|
223
293
|
|
224
|
-
body =
|
225
|
-
|
226
|
-
optional_body = dict(
|
227
|
-
duration=duration,
|
228
|
-
chunk_time=chunk_time,
|
229
|
-
voice=voice,
|
294
|
+
body = self._generate_request_body(
|
295
|
+
transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
|
230
296
|
)
|
231
|
-
body.update({k: v for k, v in optional_body.items() if v is not None})
|
232
297
|
|
233
298
|
if websocket:
|
234
299
|
generator = self._generate_ws(body)
|
@@ -254,23 +319,14 @@ class CartesiaTTS:
|
|
254
319
|
data=json.dumps(body),
|
255
320
|
headers=self.headers,
|
256
321
|
)
|
257
|
-
if response.
|
322
|
+
if not response.ok:
|
258
323
|
raise ValueError(f"Failed to generate audio. {response.text}")
|
259
324
|
|
260
325
|
buffer = ""
|
261
326
|
for chunk_bytes in response.iter_content(chunk_size=None):
|
262
|
-
buffer
|
263
|
-
|
264
|
-
|
265
|
-
end_index = buffer.find("}", start_index)
|
266
|
-
if start_index != -1 and end_index != -1:
|
267
|
-
try:
|
268
|
-
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
269
|
-
audio = base64.b64decode(chunk_json["data"])
|
270
|
-
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
271
|
-
buffer = buffer[end_index + 1 :]
|
272
|
-
except json.JSONDecodeError:
|
273
|
-
break
|
327
|
+
buffer, outputs = update_buffer(buffer, chunk_bytes)
|
328
|
+
for output in outputs:
|
329
|
+
yield output
|
274
330
|
|
275
331
|
if buffer:
|
276
332
|
try:
|
@@ -280,21 +336,41 @@ class CartesiaTTS:
|
|
280
336
|
except json.JSONDecodeError:
|
281
337
|
pass
|
282
338
|
|
283
|
-
def _generate_ws(self, body: Dict[str, Any]):
|
339
|
+
def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
|
340
|
+
"""Generate audio using the websocket connection.
|
341
|
+
|
342
|
+
Args:
|
343
|
+
body: The request body.
|
344
|
+
context_id: The context id for the request.
|
345
|
+
The context id must be globally unique for the duration this client exists.
|
346
|
+
If this is provided, the context id that is in the response will
|
347
|
+
also be returned as part of the dict. This is helpful for testing.
|
348
|
+
"""
|
284
349
|
if not self.websocket or self._is_websocket_closed():
|
285
350
|
self.refresh_websocket()
|
286
351
|
|
287
|
-
|
352
|
+
include_context_id = bool(context_id)
|
353
|
+
if context_id is None:
|
354
|
+
context_id = uuid.uuid4().hex
|
355
|
+
self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
|
288
356
|
try:
|
289
|
-
|
290
|
-
while not response["done"]:
|
291
|
-
audio = base64.b64decode(response["data"])
|
292
|
-
# print("timing", time.perf_counter() - start)
|
293
|
-
yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
|
294
|
-
|
357
|
+
while True:
|
295
358
|
response = json.loads(self.websocket.recv())
|
296
|
-
|
297
|
-
|
359
|
+
if response["done"]:
|
360
|
+
break
|
361
|
+
|
362
|
+
yield convert_response(response, include_context_id)
|
363
|
+
|
364
|
+
if self.experimental_ws_handle_interrupts:
|
365
|
+
self.websocket.send(json.dumps({"context_id": context_id}))
|
366
|
+
except GeneratorExit:
|
367
|
+
# The exit is only called when the generator is garbage collected.
|
368
|
+
# It may not be called directly after a break statement.
|
369
|
+
# However, the generator will be automatically cancelled on the next request.
|
370
|
+
if self.experimental_ws_handle_interrupts:
|
371
|
+
self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
|
372
|
+
except Exception as e:
|
373
|
+
raise RuntimeError(f"Failed to generate audio. {response}") from e
|
298
374
|
|
299
375
|
def _http_url(self):
|
300
376
|
prefix = "http" if "localhost" in self.base_url else "https"
|
@@ -307,3 +383,143 @@ class CartesiaTTS:
|
|
307
383
|
def __del__(self):
|
308
384
|
if self.websocket.socket.fileno() > -1:
|
309
385
|
self.websocket.close()
|
386
|
+
|
387
|
+
|
388
|
+
class AsyncCartesiaTTS(CartesiaTTS):
|
389
|
+
def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
|
390
|
+
self.timeout = aiohttp.ClientTimeout(total=DEFAULT_TIMEOUT)
|
391
|
+
self.connector = aiohttp.TCPConnector(limit=DEFAULT_NUM_CONNECTIONS)
|
392
|
+
self._session = aiohttp.ClientSession(timeout=self.timeout, connector=self.connector)
|
393
|
+
super().__init__(
|
394
|
+
api_key=api_key, experimental_ws_handle_interrupts=experimental_ws_handle_interrupts
|
395
|
+
)
|
396
|
+
|
397
|
+
def refresh_websocket(self):
|
398
|
+
pass # do not load the websocket for the client until asynchronously when it is needed
|
399
|
+
|
400
|
+
async def _async_refresh_websocket(self):
|
401
|
+
"""Refresh the websocket connection."""
|
402
|
+
if self.websocket and not self._is_websocket_closed():
|
403
|
+
self.websocket.close()
|
404
|
+
route = "audio/websocket"
|
405
|
+
if self.experimental_ws_handle_interrupts:
|
406
|
+
route = f"experimental/{route}"
|
407
|
+
self.websocket = await self._session.ws_connect(
|
408
|
+
f"{self._ws_url()}/{route}?api_key={self.api_key}"
|
409
|
+
)
|
410
|
+
|
411
|
+
async def generate(
|
412
|
+
self,
|
413
|
+
*,
|
414
|
+
transcript: str,
|
415
|
+
duration: int = None,
|
416
|
+
chunk_time: float = None,
|
417
|
+
voice: Embedding = None,
|
418
|
+
stream: bool = False,
|
419
|
+
websocket: bool = True,
|
420
|
+
) -> Union[AudioOutput, AsyncGenerator[AudioOutput, None]]:
|
421
|
+
"""Asynchronously generate audio from a transcript.
|
422
|
+
NOTE: This overrides the non-asynchronous generate method from the base class.
|
423
|
+
Args:
|
424
|
+
transcript: The text to generate audio for.
|
425
|
+
voice: The embedding to use for generating audio.
|
426
|
+
options: The options to use for generating audio. See :class:`GenerateOptions`.
|
427
|
+
Returns:
|
428
|
+
A dictionary containing the following (when `stream=True`, an async generator yielding one such dictionary per chunk is returned instead):
|
429
|
+
* "audio": The audio as raw bytes.
|
430
|
+
* "sampling_rate": The sampling rate of the audio.
|
431
|
+
"""
|
432
|
+
body = self._generate_request_body(
|
433
|
+
transcript=transcript, duration=duration, chunk_time=chunk_time, voice=voice
|
434
|
+
)
|
435
|
+
|
436
|
+
if websocket:
|
437
|
+
generator = self._generate_ws(body)
|
438
|
+
else:
|
439
|
+
generator = self._generate_http(body)
|
440
|
+
|
441
|
+
if stream:
|
442
|
+
return generator
|
443
|
+
|
444
|
+
chunks = []
|
445
|
+
sampling_rate = None
|
446
|
+
async for chunk in generator:
|
447
|
+
if sampling_rate is None:
|
448
|
+
sampling_rate = chunk["sampling_rate"]
|
449
|
+
chunks.append(chunk["audio"])
|
450
|
+
|
451
|
+
return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
|
452
|
+
|
453
|
+
async def _generate_http(self, body: Dict[str, Any]):
|
454
|
+
async with self._session.post(
|
455
|
+
f"{self._http_url()}/audio/stream", data=json.dumps(body), headers=self.headers
|
456
|
+
) as response:
|
457
|
+
if response.status < 200 or response.status >= 300:
|
458
|
+
raise ValueError(f"Failed to generate audio. {response.text}")
|
459
|
+
|
460
|
+
buffer = ""
|
461
|
+
async for chunk_bytes in response.content.iter_any():
|
462
|
+
buffer, outputs = update_buffer(buffer, chunk_bytes)
|
463
|
+
for output in outputs:
|
464
|
+
yield output
|
465
|
+
|
466
|
+
if buffer:
|
467
|
+
try:
|
468
|
+
chunk_json = json.loads(buffer)
|
469
|
+
audio = base64.b64decode(chunk_json["data"])
|
470
|
+
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
471
|
+
except json.JSONDecodeError:
|
472
|
+
pass
|
473
|
+
|
474
|
+
async def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
|
475
|
+
include_context_id = bool(context_id)
|
476
|
+
route = "audio/websocket"
|
477
|
+
if self.experimental_ws_handle_interrupts:
|
478
|
+
route = f"experimental/{route}"
|
479
|
+
|
480
|
+
if not self.websocket or self._is_websocket_closed():
|
481
|
+
await self._async_refresh_websocket()
|
482
|
+
|
483
|
+
ws = self.websocket
|
484
|
+
if context_id is None:
|
485
|
+
context_id = uuid.uuid4().hex
|
486
|
+
await ws.send_json({"data": body, "context_id": context_id})
|
487
|
+
try:
|
488
|
+
response = None
|
489
|
+
while True:
|
490
|
+
response = await ws.receive_json()
|
491
|
+
if response["done"]:
|
492
|
+
break
|
493
|
+
|
494
|
+
yield convert_response(response, include_context_id)
|
495
|
+
|
496
|
+
if self.experimental_ws_handle_interrupts:
|
497
|
+
await ws.send_json({"context_id": context_id})
|
498
|
+
except GeneratorExit:
|
499
|
+
# The exit is only called when the generator is garbage collected.
|
500
|
+
# It may not be called directly after a break statement.
|
501
|
+
# However, the generator will be automatically cancelled on the next request.
|
502
|
+
if self.experimental_ws_handle_interrupts:
|
503
|
+
await ws.send_json({"context_id": context_id, "action": "cancel"})
|
504
|
+
except Exception as e:
|
505
|
+
raise RuntimeError(f"Failed to generate audio. {response}") from e
|
506
|
+
|
507
|
+
def _is_websocket_closed(self):
|
508
|
+
return self.websocket.closed
|
509
|
+
|
510
|
+
async def cleanup(self):
|
511
|
+
if self.websocket is not None and not self._is_websocket_closed():
|
512
|
+
await self.websocket.close()
|
513
|
+
if not self._session.closed:
|
514
|
+
await self._session.close()
|
515
|
+
|
516
|
+
def __del__(self):
|
517
|
+
try:
|
518
|
+
loop = asyncio.get_running_loop()
|
519
|
+
except RuntimeError:
|
520
|
+
loop = None
|
521
|
+
|
522
|
+
if loop is None:
|
523
|
+
asyncio.run(self.cleanup())
|
524
|
+
else:
|
525
|
+
loop.create_task(self.cleanup())
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "0.0.5rc1"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.5rc1
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -10,8 +10,11 @@ Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
11
11
|
Requires-Python: >=3.8.0
|
12
12
|
Description-Content-Type: text/markdown
|
13
|
-
Requires-Dist:
|
13
|
+
Requires-Dist: aiohttp
|
14
|
+
Requires-Dist: httpx
|
15
|
+
Requires-Dist: pytest-asyncio
|
14
16
|
Requires-Dist: requests
|
17
|
+
Requires-Dist: websockets
|
15
18
|
Provides-Extra: dev
|
16
19
|
Requires-Dist: pre-commit; extra == "dev"
|
17
20
|
Requires-Dist: docformatter; extra == "dev"
|
@@ -21,6 +24,7 @@ Requires-Dist: flake8==7.0.0; extra == "dev"
|
|
21
24
|
Requires-Dist: flake8-bugbear==24.2.6; extra == "dev"
|
22
25
|
Requires-Dist: pytest>=8.0.2; extra == "dev"
|
23
26
|
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
27
|
+
Requires-Dist: twine; extra == "dev"
|
24
28
|
Provides-Extra: all
|
25
29
|
Requires-Dist: pre-commit; extra == "all"
|
26
30
|
Requires-Dist: docformatter; extra == "all"
|
@@ -30,6 +34,7 @@ Requires-Dist: flake8==7.0.0; extra == "all"
|
|
30
34
|
Requires-Dist: flake8-bugbear==24.2.6; extra == "all"
|
31
35
|
Requires-Dist: pytest>=8.0.2; extra == "all"
|
32
36
|
Requires-Dist: pytest-cov>=4.1.0; extra == "all"
|
37
|
+
Requires-Dist: twine; extra == "all"
|
33
38
|
|
34
39
|
|
35
40
|
# Cartesia Python API Library
|
@@ -104,10 +109,42 @@ for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
|
104
109
|
audio_data.seek(0)
|
105
110
|
|
106
111
|
# Create an Audio object from the BytesIO data
|
107
|
-
audio = Audio(audio_data, rate=output["sampling_rate"])
|
112
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
113
|
+
|
114
|
+
# Display the Audio object
|
115
|
+
display(audio)
|
116
|
+
```
|
117
|
+
|
118
|
+
You can also use the async client if you want to make asynchronous API calls. The usage is very similar:
|
119
|
+
```python
|
120
|
+
from cartesia.tts import AsyncCartesiaTTS
|
121
|
+
from IPython.display import Audio
|
122
|
+
import io
|
123
|
+
import os
|
124
|
+
|
125
|
+
client = AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
126
|
+
voices = client.get_voices()
|
127
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
128
|
+
transcript = "Hello! Welcome to Cartesia"
|
129
|
+
|
130
|
+
# Create a BytesIO object to store the audio data
|
131
|
+
audio_data = io.BytesIO()
|
132
|
+
|
133
|
+
# Generate and stream audio
|
134
|
+
async for output in await client.generate(transcript=transcript, voice=voice, stream=True):
|
135
|
+
buffer = output["audio"]
|
136
|
+
audio_data.write(buffer)
|
137
|
+
|
138
|
+
# Set the cursor position to the beginning of the BytesIO object
|
139
|
+
audio_data.seek(0)
|
140
|
+
|
141
|
+
# Create an Audio object from the BytesIO data
|
142
|
+
audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=output["sampling_rate"])
|
108
143
|
|
109
144
|
# Display the Audio object
|
110
145
|
display(audio)
|
111
146
|
```
|
112
147
|
|
113
|
-
|
148
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
149
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
150
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
|
@@ -1,5 +1,8 @@
|
|
1
|
-
|
1
|
+
aiohttp
|
2
|
+
httpx
|
3
|
+
pytest-asyncio
|
2
4
|
requests
|
5
|
+
websockets
|
3
6
|
|
4
7
|
[all]
|
5
8
|
pre-commit
|
@@ -10,6 +13,7 @@ flake8==7.0.0
|
|
10
13
|
flake8-bugbear==24.2.6
|
11
14
|
pytest>=8.0.2
|
12
15
|
pytest-cov>=4.1.0
|
16
|
+
twine
|
13
17
|
|
14
18
|
[dev]
|
15
19
|
pre-commit
|
@@ -20,3 +24,4 @@ flake8==7.0.0
|
|
20
24
|
flake8-bugbear==24.2.6
|
21
25
|
pytest>=8.0.2
|
22
26
|
pytest-cov>=4.1.0
|
27
|
+
twine
|
@@ -78,7 +78,8 @@ class UploadCommand(Command):
|
|
78
78
|
"""Support setup.py upload."""
|
79
79
|
|
80
80
|
description = "Build and publish the package."
|
81
|
-
user_options = []
|
81
|
+
user_options = [("skip-upload", "u", "skip git tagging and pypi upload")]
|
82
|
+
boolean_options = ["skip-upload"]
|
82
83
|
|
83
84
|
@staticmethod
|
84
85
|
def status(s):
|
@@ -86,21 +87,26 @@ class UploadCommand(Command):
|
|
86
87
|
print("\033[1m{0}\033[0m".format(s))
|
87
88
|
|
88
89
|
def initialize_options(self):
|
89
|
-
|
90
|
+
self.skip_upload = False
|
90
91
|
|
91
92
|
def finalize_options(self):
|
92
|
-
|
93
|
+
self.skip_upload = bool(self.skip_upload)
|
93
94
|
|
94
95
|
def run(self):
|
95
96
|
try:
|
96
97
|
self.status("Removing previous builds…")
|
97
98
|
rmtree(os.path.join(here, "dist"))
|
99
|
+
rmtree(os.path.join(here, "build"))
|
98
100
|
except OSError:
|
99
101
|
pass
|
100
102
|
|
101
103
|
self.status("Building Source and Wheel (universal) distribution…")
|
102
104
|
os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
|
103
105
|
|
106
|
+
if self.skip_upload:
|
107
|
+
self.status("Skipping git tagging and pypi upload")
|
108
|
+
sys.exit()
|
109
|
+
|
104
110
|
self.status("Uploading the package to PyPI via Twine…")
|
105
111
|
os.system("twine upload dist/*")
|
106
112
|
|
@@ -116,6 +122,9 @@ class BumpVersionCommand(Command):
|
|
116
122
|
To use: python setup.py bumpversion -v <version>
|
117
123
|
|
118
124
|
This command will push the new version directly and tag it.
|
125
|
+
|
126
|
+
Usage:
|
127
|
+
python setup.py bumpversion --version=1.0.1
|
119
128
|
"""
|
120
129
|
|
121
130
|
description = "Installs the foo."
|
@@ -130,6 +139,11 @@ class BumpVersionCommand(Command):
|
|
130
139
|
|
131
140
|
def initialize_options(self):
|
132
141
|
self.version = None
|
142
|
+
self.base_branch = None
|
143
|
+
self.version_branch = None
|
144
|
+
self.updated_files = [
|
145
|
+
"cartesia/version.py",
|
146
|
+
]
|
133
147
|
|
134
148
|
def finalize_options(self):
|
135
149
|
# This package cannot be imported at top level because it
|
@@ -147,14 +161,18 @@ class BumpVersionCommand(Command):
|
|
147
161
|
)
|
148
162
|
|
149
163
|
def _undo(self):
|
150
|
-
os.system(f"git restore --staged {
|
151
|
-
os.system(f"git checkout -- {
|
164
|
+
os.system(f"git restore --staged {' '.join(self.updated_files)}")
|
165
|
+
os.system(f"git checkout -- {' '.join(self.updated_files)}")
|
166
|
+
|
167
|
+
# Return to the original branch
|
168
|
+
os.system(f"git checkout {self.base_branch}")
|
169
|
+
os.system(f"git branch -D {self.version_branch}")
|
152
170
|
|
153
171
|
def run(self):
|
154
172
|
current_version = about["__version__"]
|
155
173
|
|
156
174
|
self.status("Checking current branch is 'main'")
|
157
|
-
current_branch = get_git_branch()
|
175
|
+
self.base_branch = current_branch = get_git_branch()
|
158
176
|
if current_branch != "main":
|
159
177
|
raise RuntimeError(
|
160
178
|
"You can only bump the version from the 'main' branch. "
|
@@ -174,33 +192,43 @@ class BumpVersionCommand(Command):
|
|
174
192
|
|
175
193
|
# TODO: Add check to see if all tests are passing on main.
|
176
194
|
|
195
|
+
# Checkout new branch
|
196
|
+
self.version_branch = f"bumpversion/v{self.version}"
|
197
|
+
self.status(f"Create branch '{self.version_branch}'")
|
198
|
+
err_code = os.system(f"git checkout -b {self.version_branch}")
|
199
|
+
if err_code != 0:
|
200
|
+
raise RuntimeError("Failed to create branch.")
|
201
|
+
|
177
202
|
# Change the version in __init__.py
|
178
203
|
self.status(f"Updating version {current_version} -> {self.version}")
|
179
204
|
update_version(self.version)
|
180
|
-
if current_version != self.version:
|
181
|
-
|
182
|
-
|
205
|
+
# if current_version != self.version:
|
206
|
+
# self._undo()
|
207
|
+
# raise RuntimeError("Failed to update version.")
|
183
208
|
|
184
|
-
self.status(f"Adding {
|
185
|
-
err_code = os.system(f"git add {
|
209
|
+
self.status(f"Adding {', '.join(self.updated_files)} to git")
|
210
|
+
err_code = os.system(f"git add {' '.join(self.updated_files)}")
|
186
211
|
if err_code != 0:
|
187
212
|
self._undo()
|
188
|
-
raise RuntimeError("Failed to add
|
213
|
+
raise RuntimeError("Failed to add files to git.")
|
189
214
|
|
190
215
|
# Commit the file with a message '[bumpversion] v<version>'.
|
191
216
|
self.status(f"Commit with message '[bumpversion] v{self.version}'")
|
192
|
-
err_code = os.system("git commit -m '[bumpversion] v{}'".format(
|
217
|
+
err_code = os.system("git commit -m '[bumpversion] v{}'".format(self.version))
|
193
218
|
if err_code != 0:
|
194
219
|
self._undo()
|
195
220
|
raise RuntimeError("Failed to commit file to git.")
|
196
221
|
|
197
222
|
# Push the commit to origin.
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
223
|
+
self.status(f"Pushing commit to origin/{self.version_branch}")
|
224
|
+
err_code = os.system(f"git push --force --set-upstream origin {self.version_branch}")
|
225
|
+
if err_code != 0:
|
226
|
+
# TODO: undo the commit automatically.
|
227
|
+
self._undo()
|
228
|
+
raise RuntimeError("Failed to push commit to origin.")
|
203
229
|
|
230
|
+
os.system(f"git checkout {self.base_branch}")
|
231
|
+
os.system(f"git branch -D {self.version_branch}")
|
204
232
|
sys.exit()
|
205
233
|
|
206
234
|
|
@@ -6,11 +6,14 @@ but rather for general correctness.
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
import os
|
9
|
-
|
9
|
+
import sys
|
10
|
+
import uuid
|
11
|
+
from typing import AsyncGenerator, Dict, Generator, List
|
10
12
|
|
11
13
|
import pytest
|
12
14
|
|
13
|
-
|
15
|
+
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
|
16
|
+
from cartesia.tts import DEFAULT_MODEL_ID, AsyncCartesiaTTS, CartesiaTTS, VoiceMetadata
|
14
17
|
|
15
18
|
SAMPLE_VOICE = "Milo"
|
16
19
|
|
@@ -21,9 +24,24 @@ class _Resources:
|
|
21
24
|
self.voices = voices
|
22
25
|
|
23
26
|
|
27
|
+
def create_client():
|
28
|
+
return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
29
|
+
|
30
|
+
|
31
|
+
def create_async_client():
|
32
|
+
return AsyncCartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
33
|
+
|
34
|
+
|
24
35
|
@pytest.fixture(scope="session")
|
25
36
|
def client():
|
26
|
-
return
|
37
|
+
return create_client()
|
38
|
+
|
39
|
+
|
40
|
+
@pytest.fixture(scope="session")
|
41
|
+
def client_with_ws_interrupt():
|
42
|
+
return CartesiaTTS(
|
43
|
+
api_key=os.environ.get("CARTESIA_API_KEY"), experimental_ws_handle_interrupts=True
|
44
|
+
)
|
27
45
|
|
28
46
|
|
29
47
|
@pytest.fixture(scope="session")
|
@@ -93,6 +111,80 @@ def test_generate_stream(resources: _Resources, websocket: bool):
|
|
93
111
|
assert isinstance(output["sampling_rate"], int)
|
94
112
|
|
95
113
|
|
114
|
+
@pytest.mark.parametrize("websocket", [True, False])
|
115
|
+
@pytest.mark.asyncio
|
116
|
+
async def test_async_generate(resources: _Resources, websocket: bool):
|
117
|
+
voices = resources.voices
|
118
|
+
embedding = voices[SAMPLE_VOICE]["embedding"]
|
119
|
+
transcript = "Hello, world!"
|
120
|
+
|
121
|
+
async_client = create_async_client()
|
122
|
+
output = await async_client.generate(
|
123
|
+
transcript=transcript, voice=embedding, websocket=websocket
|
124
|
+
)
|
125
|
+
|
126
|
+
assert output.keys() == {"audio", "sampling_rate"}
|
127
|
+
assert isinstance(output["audio"], bytes)
|
128
|
+
assert isinstance(output["sampling_rate"], int)
|
129
|
+
|
130
|
+
|
131
|
+
@pytest.mark.parametrize("websocket", [True, False])
|
132
|
+
@pytest.mark.asyncio
|
133
|
+
async def test_async_generate_stream(resources: _Resources, websocket: bool):
|
134
|
+
voices = resources.voices
|
135
|
+
embedding = voices[SAMPLE_VOICE]["embedding"]
|
136
|
+
transcript = "Hello, world!"
|
137
|
+
|
138
|
+
async_client = create_async_client()
|
139
|
+
|
140
|
+
generator = await async_client.generate(transcript=transcript, voice=embedding, stream=True)
|
141
|
+
assert isinstance(generator, AsyncGenerator)
|
142
|
+
|
143
|
+
async for output in generator:
|
144
|
+
assert output.keys() == {"audio", "sampling_rate"}
|
145
|
+
assert isinstance(output["audio"], bytes)
|
146
|
+
assert isinstance(output["sampling_rate"], int)
|
147
|
+
|
148
|
+
|
149
|
+
@pytest.mark.parametrize(
|
150
|
+
"actions",
|
151
|
+
[
|
152
|
+
["cancel-5", None],
|
153
|
+
["cancel-5", "cancel-1", None],
|
154
|
+
[None, "cancel-3", None],
|
155
|
+
[None, "cancel-1", "cancel-2"],
|
156
|
+
],
|
157
|
+
)
|
158
|
+
def test_generate_stream_interrupt(
|
159
|
+
client_with_ws_interrupt: CartesiaTTS, resources: _Resources, actions: List[str]
|
160
|
+
):
|
161
|
+
client = client_with_ws_interrupt
|
162
|
+
voices = resources.voices
|
163
|
+
embedding = voices[SAMPLE_VOICE]["embedding"]
|
164
|
+
transcript = "Hello, world!"
|
165
|
+
|
166
|
+
context_ids = [f"test-{uuid.uuid4().hex[:6]}" for _ in range(len(actions))]
|
167
|
+
|
168
|
+
for context_id, action in zip(context_ids, actions):
|
169
|
+
body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=embedding)
|
170
|
+
|
171
|
+
# Parse actions to see what we should expect.
|
172
|
+
if action is None:
|
173
|
+
num_turns = None
|
174
|
+
elif "cancel" in action:
|
175
|
+
num_turns = int(action.split("-")[1])
|
176
|
+
|
177
|
+
generator = client._generate_ws(body, context_id=context_id)
|
178
|
+
for idx, response in enumerate(generator):
|
179
|
+
assert response.keys() == {"audio", "sampling_rate", "context_id"}
|
180
|
+
assert response["context_id"] == context_id, (
|
181
|
+
f"Context ID from response ({response['context_id']}) does not match "
|
182
|
+
f"the expected context ID ({context_id})"
|
183
|
+
)
|
184
|
+
if idx + 1 == num_turns:
|
185
|
+
break
|
186
|
+
|
187
|
+
|
96
188
|
@pytest.mark.parametrize("chunk_time", [0.05, 0.6])
|
97
189
|
def test_check_inputs_invalid_chunk_time(client: CartesiaTTS, chunk_time):
|
98
190
|
with pytest.raises(ValueError, match="`chunk_time` must be between 0.1 and 0.5"):
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = "0.0.3"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|