cartesia 0.0.2__py2.py3-none-any.whl → 0.0.4__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/tts.py +74 -29
- cartesia/version.py +1 -1
- {cartesia-0.0.2.dist-info → cartesia-0.0.4.dist-info}/METADATA +57 -12
- cartesia-0.0.4.dist-info/RECORD +7 -0
- cartesia-0.0.2.dist-info/RECORD +0 -7
- {cartesia-0.0.2.dist-info → cartesia-0.0.4.dist-info}/WHEEL +0 -0
- {cartesia-0.0.2.dist-info → cartesia-0.0.4.dist-info}/top_level.txt +0 -0
cartesia/tts.py
CHANGED
@@ -4,7 +4,6 @@ import os
|
|
4
4
|
import uuid
|
5
5
|
from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
|
6
6
|
|
7
|
-
import numpy as np
|
8
7
|
import requests
|
9
8
|
from websockets.sync.client import connect
|
10
9
|
|
@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"
|
|
14
13
|
|
15
14
|
|
16
15
|
class AudioOutput(TypedDict):
|
17
|
-
audio:
|
16
|
+
audio: bytes
|
18
17
|
sampling_rate: int
|
19
18
|
|
20
19
|
|
@@ -32,7 +31,11 @@ class CartesiaTTS:
|
|
32
31
|
"""The client for Cartesia's text-to-speech library.
|
33
32
|
|
34
33
|
This client contains methods to interact with the Cartesia text-to-speech API.
|
35
|
-
The
|
34
|
+
The client can be used to retrieve available voices, compute new voice embeddings,
|
35
|
+
and generate speech from text.
|
36
|
+
|
37
|
+
The client also supports generating audio using a websocket for lower latency.
|
38
|
+
To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
|
36
39
|
|
37
40
|
Examples:
|
38
41
|
|
@@ -56,18 +59,22 @@ class CartesiaTTS:
|
|
56
59
|
... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
|
57
60
|
"""
|
58
61
|
|
59
|
-
def __init__(self, *, api_key: str = None):
|
62
|
+
def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
|
60
63
|
"""
|
61
64
|
Args:
|
62
65
|
api_key: The API key to use for authorization.
|
63
66
|
If not specified, the API key will be read from the environment variable
|
64
67
|
`CARTESIA_API_KEY`.
|
68
|
+
experimental_ws_handle_interrupts: Whether to handle interrupts when generating
|
69
|
+
audio using the websocket. This is an experimental feature and may have bugs
|
70
|
+
or be deprecated in the future.
|
65
71
|
"""
|
66
72
|
self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
|
67
73
|
self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
|
68
74
|
self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
|
69
75
|
self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
|
70
76
|
self.websocket = None
|
77
|
+
self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
|
71
78
|
self.refresh_websocket()
|
72
79
|
|
73
80
|
def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
|
@@ -168,21 +175,37 @@ class CartesiaTTS:
|
|
168
175
|
"""
|
169
176
|
if self.websocket and not self._is_websocket_closed():
|
170
177
|
self.websocket.close()
|
178
|
+
route = "audio/websocket"
|
179
|
+
if self.experimental_ws_handle_interrupts:
|
180
|
+
route = f"experimental/{route}"
|
171
181
|
self.websocket = connect(
|
172
|
-
f"{self._ws_url()}/
|
182
|
+
f"{self._ws_url()}/{route}?api_key={self.api_key}",
|
173
183
|
close_timeout=None,
|
174
184
|
)
|
175
185
|
|
176
186
|
def _is_websocket_closed(self):
|
177
187
|
return self.websocket.socket.fileno() == -1
|
178
188
|
|
189
|
+
def _check_inputs(
|
190
|
+
self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
|
191
|
+
):
|
192
|
+
if chunk_time is not None:
|
193
|
+
if chunk_time < 0.1 or chunk_time > 0.5:
|
194
|
+
raise ValueError("`chunk_time` must be between 0.1 and 0.5")
|
195
|
+
|
196
|
+
if chunk_time is not None and duration is not None:
|
197
|
+
if duration < chunk_time:
|
198
|
+
raise ValueError("`duration` must be greater than chunk_time")
|
199
|
+
|
200
|
+
if transcript.strip() == "":
|
201
|
+
raise ValueError("`transcript` must be non empty")
|
202
|
+
|
179
203
|
def generate(
|
180
204
|
self,
|
181
205
|
*,
|
182
206
|
transcript: str,
|
183
207
|
duration: int = None,
|
184
208
|
chunk_time: float = None,
|
185
|
-
lookahead: int = None,
|
186
209
|
voice: Embedding = None,
|
187
210
|
stream: bool = False,
|
188
211
|
websocket: bool = True,
|
@@ -194,8 +217,6 @@ class CartesiaTTS:
|
|
194
217
|
duration: The maximum duration of the audio in seconds.
|
195
218
|
chunk_time: How long each audio segment should be in seconds.
|
196
219
|
This should not need to be adjusted.
|
197
|
-
lookahead: The number of seconds to look ahead for each chunk.
|
198
|
-
This should not need to be adjusted.
|
199
220
|
voice: The voice to use for generating audio.
|
200
221
|
This can either be a voice id (string) or an embedding vector (List[float]).
|
201
222
|
stream: Whether to stream the audio or not.
|
@@ -206,18 +227,16 @@ class CartesiaTTS:
|
|
206
227
|
Returns:
|
207
228
|
A generator if `stream` is True, otherwise a dictionary.
|
208
229
|
Dictionary from both generator and non-generator return types have the following keys:
|
209
|
-
* "audio": The audio as a
|
230
|
+
* "audio": The audio as a bytes buffer.
|
210
231
|
* "sampling_rate": The sampling rate of the audio.
|
211
232
|
"""
|
212
|
-
|
233
|
+
self._check_inputs(transcript, duration, chunk_time)
|
213
234
|
|
214
|
-
|
215
|
-
voice = self._voices[voice]
|
235
|
+
body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
|
216
236
|
|
217
237
|
optional_body = dict(
|
218
238
|
duration=duration,
|
219
239
|
chunk_time=chunk_time,
|
220
|
-
lookahead=lookahead,
|
221
240
|
voice=voice,
|
222
241
|
)
|
223
242
|
body.update({k: v for k, v in optional_body.items() if v is not None})
|
@@ -237,7 +256,7 @@ class CartesiaTTS:
|
|
237
256
|
sampling_rate = chunk["sampling_rate"]
|
238
257
|
chunks.append(chunk["audio"])
|
239
258
|
|
240
|
-
return {"audio":
|
259
|
+
return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
|
241
260
|
|
242
261
|
def _generate_http(self, body: Dict[str, Any]):
|
243
262
|
response = requests.post(
|
@@ -258,8 +277,7 @@ class CartesiaTTS:
|
|
258
277
|
if start_index != -1 and end_index != -1:
|
259
278
|
try:
|
260
279
|
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
261
|
-
|
262
|
-
audio = np.frombuffer(data, dtype=np.float32)
|
280
|
+
audio = base64.b64decode(chunk_json["data"])
|
263
281
|
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
264
282
|
buffer = buffer[end_index + 1 :]
|
265
283
|
except json.JSONDecodeError:
|
@@ -268,28 +286,55 @@ class CartesiaTTS:
|
|
268
286
|
if buffer:
|
269
287
|
try:
|
270
288
|
chunk_json = json.loads(buffer)
|
271
|
-
|
272
|
-
audio = np.frombuffer(data, dtype=np.float32)
|
289
|
+
audio = base64.b64decode(chunk_json["data"])
|
273
290
|
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
274
291
|
except json.JSONDecodeError:
|
275
292
|
pass
|
276
293
|
|
277
|
-
def _generate_ws(self, body: Dict[str, Any]):
|
294
|
+
def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
|
295
|
+
"""Generate audio using the websocket connection.
|
296
|
+
|
297
|
+
Args:
|
298
|
+
body: The request body.
|
299
|
+
context_id: The context id for the request.
|
300
|
+
The context id must be globally unique for the duration this client exists.
|
301
|
+
If this is provided, the context id that is in the response will
|
302
|
+
also be returned as part of the dict. This is helpful for testing.
|
303
|
+
"""
|
278
304
|
if not self.websocket or self._is_websocket_closed():
|
279
305
|
self.refresh_websocket()
|
280
306
|
|
281
|
-
|
307
|
+
include_context_id = bool(context_id)
|
308
|
+
if context_id is None:
|
309
|
+
context_id = uuid.uuid4().hex
|
310
|
+
self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
|
282
311
|
try:
|
283
|
-
|
284
|
-
while not response["done"]:
|
285
|
-
data = base64.b64decode(response["data"])
|
286
|
-
audio = np.frombuffer(data, dtype=np.float32)
|
287
|
-
# print("timing", time.perf_counter() - start)
|
288
|
-
yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
|
289
|
-
|
312
|
+
while True:
|
290
313
|
response = json.loads(self.websocket.recv())
|
291
|
-
|
292
|
-
|
314
|
+
if response["done"]:
|
315
|
+
break
|
316
|
+
audio = base64.b64decode(response["data"])
|
317
|
+
|
318
|
+
optional_kwargs = {}
|
319
|
+
if include_context_id:
|
320
|
+
optional_kwargs["context_id"] = response["context_id"]
|
321
|
+
|
322
|
+
yield {
|
323
|
+
"audio": audio,
|
324
|
+
"sampling_rate": response["sampling_rate"],
|
325
|
+
**optional_kwargs,
|
326
|
+
}
|
327
|
+
|
328
|
+
if self.experimental_ws_handle_interrupts:
|
329
|
+
self.websocket.send(json.dumps({"context_id": context_id}))
|
330
|
+
except GeneratorExit:
|
331
|
+
# The exit is only called when the generator is garbage collected.
|
332
|
+
# It may not be called directly after a break statement.
|
333
|
+
# However, the generator will be automatically cancelled on the next request.
|
334
|
+
if self.experimental_ws_handle_interrupts:
|
335
|
+
self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
|
336
|
+
except Exception as e:
|
337
|
+
raise RuntimeError(f"Failed to generate audio. {response}") from e
|
293
338
|
|
294
339
|
def _http_url(self):
|
295
340
|
prefix = "http" if "localhost" in self.base_url else "https"
|
cartesia/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.0.
|
1
|
+
__version__ = "0.0.4"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.4
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
|
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
Requires-Dist: websockets
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: numpy
|
16
15
|
Provides-Extra: all
|
17
16
|
Requires-Dist: pre-commit ; extra == 'all'
|
18
17
|
Requires-Dist: docformatter ; extra == 'all'
|
@@ -49,22 +48,68 @@ pip install -e '.[dev]'
|
|
49
48
|
## Usage
|
50
49
|
```python
|
51
50
|
from cartesia.tts import CartesiaTTS
|
52
|
-
|
51
|
+
import pyaudio
|
52
|
+
import os
|
53
53
|
|
54
54
|
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
55
|
-
|
56
55
|
voices = client.get_voices()
|
57
|
-
|
56
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
58
57
|
transcript = "Hello! Welcome to Cartesia"
|
59
58
|
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
p = pyaudio.PyAudio()
|
60
|
+
|
61
|
+
stream = None
|
63
62
|
|
64
|
-
#
|
65
|
-
for output in client.generate(transcript=transcript, voice=
|
66
|
-
|
63
|
+
# Generate and stream audio
|
64
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
65
|
+
buffer = output["audio"]
|
67
66
|
rate = output["sampling_rate"]
|
67
|
+
|
68
|
+
if not stream:
|
69
|
+
stream = p.open(format=pyaudio.paFloat32,
|
70
|
+
channels=1,
|
71
|
+
rate=rate,
|
72
|
+
output=True)
|
73
|
+
|
74
|
+
# Write the audio data to the stream
|
75
|
+
stream.write(buffer)
|
76
|
+
|
77
|
+
stream.stop_stream()
|
78
|
+
stream.close()
|
79
|
+
p.terminate()
|
80
|
+
```
|
81
|
+
|
82
|
+
If you are using Jupyter Notebook or JupyterLab, you can use `IPython.display.Audio` to play the generated audio directly in the notebook. Here's an example:
|
83
|
+
|
84
|
+
```python
|
85
|
+
from cartesia.tts import CartesiaTTS
|
86
|
+
from IPython.display import Audio
|
87
|
+
import io
|
88
|
+
import os
|
89
|
+
|
90
|
+
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
91
|
+
voices = client.get_voices()
|
92
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
93
|
+
transcript = "Hello! Welcome to Cartesia"
|
94
|
+
|
95
|
+
# Create a BytesIO object to store the audio data
|
96
|
+
audio_data = io.BytesIO()
|
97
|
+
|
98
|
+
# Generate and stream audio
|
99
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
100
|
+
buffer = output["audio"]
|
101
|
+
audio_data.write(buffer)
|
102
|
+
|
103
|
+
# Set the cursor position to the beginning of the BytesIO object
|
104
|
+
audio_data.seek(0)
|
105
|
+
|
106
|
+
# Create an Audio object from the BytesIO data
|
107
|
+
audio = Audio(audio_data, rate=output["sampling_rate"])
|
108
|
+
|
109
|
+
# Display the Audio object
|
110
|
+
display(audio)
|
68
111
|
```
|
69
112
|
|
70
|
-
|
113
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
114
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
115
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
|
@@ -0,0 +1,7 @@
|
|
1
|
+
cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
|
2
|
+
cartesia/tts.py,sha256=9m9_kqscMY0yzUU0Ty5k2HoeMqfrIbHouaS-ymcr64s,14127
|
3
|
+
cartesia/version.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
|
4
|
+
cartesia-0.0.4.dist-info/METADATA,sha256=tLUrKLREJiXrW-pfd3k61i9CnElKHk5RAyidCMxpR-s,3752
|
5
|
+
cartesia-0.0.4.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
6
|
+
cartesia-0.0.4.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
7
|
+
cartesia-0.0.4.dist-info/RECORD,,
|
cartesia-0.0.2.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
|
2
|
-
cartesia/tts.py,sha256=kQjkQhWfVrLFH6yaOb6G65HXtFDzPwLa6Q9AwVgIyCI,11901
|
3
|
-
cartesia/version.py,sha256=QvlVh4JTl3JL7jQAja76yKtT-IvF4631ASjWY1wS6AQ,22
|
4
|
-
cartesia-0.0.2.dist-info/METADATA,sha256=7BcDRyB4vxCWxcJhTbe_cWEQXCNOiEJdqoWh8WouNGs,2465
|
5
|
-
cartesia-0.0.2.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
6
|
-
cartesia-0.0.2.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
7
|
-
cartesia-0.0.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|