cartesia 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-0.0.2 → cartesia-0.0.4}/PKG-INFO +57 -12
- cartesia-0.0.4/README.md +81 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia/tts.py +74 -29
- cartesia-0.0.4/cartesia/version.py +1 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia.egg-info/PKG-INFO +57 -12
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia.egg-info/requires.txt +0 -1
- {cartesia-0.0.2 → cartesia-0.0.4}/setup.py +45 -17
- cartesia-0.0.4/tests/test_tts.py +180 -0
- cartesia-0.0.2/README.md +0 -35
- cartesia-0.0.2/cartesia/version.py +0 -1
- cartesia-0.0.2/tests/test_tts.py +0 -96
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia/__init__.py +0 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia.egg-info/SOURCES.txt +0 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/pyproject.toml +0 -0
- {cartesia-0.0.2 → cartesia-0.0.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 0.0.2
|
3
|
+
Version: 0.0.4
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
|
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
Requires-Dist: websockets
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: numpy
|
16
15
|
Provides-Extra: dev
|
17
16
|
Requires-Dist: pre-commit; extra == "dev"
|
18
17
|
Requires-Dist: docformatter; extra == "dev"
|
@@ -49,22 +48,68 @@ pip install -e '.[dev]'
|
|
49
48
|
## Usage
|
50
49
|
```python
|
51
50
|
from cartesia.tts import CartesiaTTS
|
52
|
-
|
51
|
+
import pyaudio
|
52
|
+
import os
|
53
53
|
|
54
54
|
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
55
|
-
|
56
55
|
voices = client.get_voices()
|
57
|
-
|
56
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
58
57
|
transcript = "Hello! Welcome to Cartesia"
|
59
58
|
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
p = pyaudio.PyAudio()
|
60
|
+
|
61
|
+
stream = None
|
63
62
|
|
64
|
-
# Streaming
|
65
|
-
for output in client.generate(transcript=transcript, voice=embedding, stream=True):
|
66
|
-
|
63
|
+
# Generate and stream audio
|
64
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
65
|
+
buffer = output["audio"]
|
67
66
|
rate = output["sampling_rate"]
|
67
|
+
|
68
|
+
if not stream:
|
69
|
+
stream = p.open(format=pyaudio.paFloat32,
|
70
|
+
channels=1,
|
71
|
+
rate=rate,
|
72
|
+
output=True)
|
73
|
+
|
74
|
+
# Write the audio data to the stream
|
75
|
+
stream.write(buffer)
|
76
|
+
|
77
|
+
stream.stop_stream()
|
78
|
+
stream.close()
|
79
|
+
p.terminate()
|
80
|
+
```
|
81
|
+
|
82
|
+
If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
|
83
|
+
|
84
|
+
```python
|
85
|
+
from cartesia.tts import CartesiaTTS
|
86
|
+
from IPython.display import Audio
|
87
|
+
import io
|
88
|
+
import os
|
89
|
+
|
90
|
+
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
91
|
+
voices = client.get_voices()
|
92
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
93
|
+
transcript = "Hello! Welcome to Cartesia"
|
94
|
+
|
95
|
+
# Create a BytesIO object to store the audio data
|
96
|
+
audio_data = io.BytesIO()
|
97
|
+
|
98
|
+
# Generate and stream audio
|
99
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
100
|
+
buffer = output["audio"]
|
101
|
+
audio_data.write(buffer)
|
102
|
+
|
103
|
+
# Set the cursor position to the beginning of the BytesIO object
|
104
|
+
audio_data.seek(0)
|
105
|
+
|
106
|
+
# Create an Audio object from the BytesIO data
|
107
|
+
audio = Audio(audio_data, rate=output["sampling_rate"])
|
108
|
+
|
109
|
+
# Display the Audio object
|
110
|
+
display(audio)
|
68
111
|
```
|
69
112
|
|
70
|
-
|
113
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
114
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
115
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably to a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`)
|
cartesia-0.0.4/README.md
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# Cartesia Python API Library
|
2
|
+
The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
|
3
|
+
|
4
|
+
**Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
```bash
|
8
|
+
pip install cartesia
|
9
|
+
|
10
|
+
# pip install in editable mode w/ dev dependencies
|
11
|
+
pip install -e '.[dev]'
|
12
|
+
```
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
```python
|
16
|
+
from cartesia.tts import CartesiaTTS
|
17
|
+
import pyaudio
|
18
|
+
import os
|
19
|
+
|
20
|
+
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
21
|
+
voices = client.get_voices()
|
22
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
23
|
+
transcript = "Hello! Welcome to Cartesia"
|
24
|
+
|
25
|
+
p = pyaudio.PyAudio()
|
26
|
+
|
27
|
+
stream = None
|
28
|
+
|
29
|
+
# Generate and stream audio
|
30
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
31
|
+
buffer = output["audio"]
|
32
|
+
rate = output["sampling_rate"]
|
33
|
+
|
34
|
+
if not stream:
|
35
|
+
stream = p.open(format=pyaudio.paFloat32,
|
36
|
+
channels=1,
|
37
|
+
rate=rate,
|
38
|
+
output=True)
|
39
|
+
|
40
|
+
# Write the audio data to the stream
|
41
|
+
stream.write(buffer)
|
42
|
+
|
43
|
+
stream.stop_stream()
|
44
|
+
stream.close()
|
45
|
+
p.terminate()
|
46
|
+
```
|
47
|
+
|
48
|
+
If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
|
49
|
+
|
50
|
+
```python
|
51
|
+
from cartesia.tts import CartesiaTTS
|
52
|
+
from IPython.display import Audio
|
53
|
+
import io
|
54
|
+
import os
|
55
|
+
|
56
|
+
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
57
|
+
voices = client.get_voices()
|
58
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
59
|
+
transcript = "Hello! Welcome to Cartesia"
|
60
|
+
|
61
|
+
# Create a BytesIO object to store the audio data
|
62
|
+
audio_data = io.BytesIO()
|
63
|
+
|
64
|
+
# Generate and stream audio
|
65
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
66
|
+
buffer = output["audio"]
|
67
|
+
audio_data.write(buffer)
|
68
|
+
|
69
|
+
# Set the cursor position to the beginning of the BytesIO object
|
70
|
+
audio_data.seek(0)
|
71
|
+
|
72
|
+
# Create an Audio object from the BytesIO data
|
73
|
+
audio = Audio(audio_data, rate=output["sampling_rate"])
|
74
|
+
|
75
|
+
# Display the Audio object
|
76
|
+
display(audio)
|
77
|
+
```
|
78
|
+
|
79
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
80
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
81
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably to a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`)
|
@@ -4,7 +4,6 @@ import os
|
|
4
4
|
import uuid
|
5
5
|
from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
|
6
6
|
|
7
|
-
import numpy as np
|
8
7
|
import requests
|
9
8
|
from websockets.sync.client import connect
|
10
9
|
|
@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"
|
|
14
13
|
|
15
14
|
|
16
15
|
class AudioOutput(TypedDict):
|
17
|
-
audio:
|
16
|
+
audio: bytes
|
18
17
|
sampling_rate: int
|
19
18
|
|
20
19
|
|
@@ -32,7 +31,11 @@ class CartesiaTTS:
|
|
32
31
|
"""The client for Cartesia's text-to-speech library.
|
33
32
|
|
34
33
|
This client contains methods to interact with the Cartesia text-to-speech API.
|
35
|
-
The
|
34
|
+
The client can be used to retrieve available voices, compute new voice embeddings,
|
35
|
+
and generate speech from text.
|
36
|
+
|
37
|
+
The client also supports generating audio using a websocket for lower latency.
|
38
|
+
To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
|
36
39
|
|
37
40
|
Examples:
|
38
41
|
|
@@ -56,18 +59,22 @@ class CartesiaTTS:
|
|
56
59
|
... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
|
57
60
|
"""
|
58
61
|
|
59
|
-
def __init__(self, *, api_key: str = None):
|
62
|
+
def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
|
60
63
|
"""
|
61
64
|
Args:
|
62
65
|
api_key: The API key to use for authorization.
|
63
66
|
If not specified, the API key will be read from the environment variable
|
64
67
|
`CARTESIA_API_KEY`.
|
68
|
+
experimental_ws_handle_interrupts: Whether to handle interrupts when generating
|
69
|
+
audio using the websocket. This is an experimental feature and may have bugs
|
70
|
+
or be deprecated in the future.
|
65
71
|
"""
|
66
72
|
self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
|
67
73
|
self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
|
68
74
|
self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
|
69
75
|
self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
|
70
76
|
self.websocket = None
|
77
|
+
self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
|
71
78
|
self.refresh_websocket()
|
72
79
|
|
73
80
|
def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
|
@@ -168,21 +175,37 @@ class CartesiaTTS:
|
|
168
175
|
"""
|
169
176
|
if self.websocket and not self._is_websocket_closed():
|
170
177
|
self.websocket.close()
|
178
|
+
route = "audio/websocket"
|
179
|
+
if self.experimental_ws_handle_interrupts:
|
180
|
+
route = f"experimental/{route}"
|
171
181
|
self.websocket = connect(
|
172
|
-
f"{self._ws_url()}/
|
182
|
+
f"{self._ws_url()}/{route}?api_key={self.api_key}",
|
173
183
|
close_timeout=None,
|
174
184
|
)
|
175
185
|
|
176
186
|
def _is_websocket_closed(self):
|
177
187
|
return self.websocket.socket.fileno() == -1
|
178
188
|
|
189
|
+
def _check_inputs(
|
190
|
+
self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
|
191
|
+
):
|
192
|
+
if chunk_time is not None:
|
193
|
+
if chunk_time < 0.1 or chunk_time > 0.5:
|
194
|
+
raise ValueError("`chunk_time` must be between 0.1 and 0.5")
|
195
|
+
|
196
|
+
if chunk_time is not None and duration is not None:
|
197
|
+
if duration < chunk_time:
|
198
|
+
raise ValueError("`duration` must be greater than chunk_time")
|
199
|
+
|
200
|
+
if transcript.strip() == "":
|
201
|
+
raise ValueError("`transcript` must be non empty")
|
202
|
+
|
179
203
|
def generate(
|
180
204
|
self,
|
181
205
|
*,
|
182
206
|
transcript: str,
|
183
207
|
duration: int = None,
|
184
208
|
chunk_time: float = None,
|
185
|
-
lookahead: int = None,
|
186
209
|
voice: Embedding = None,
|
187
210
|
stream: bool = False,
|
188
211
|
websocket: bool = True,
|
@@ -194,8 +217,6 @@ class CartesiaTTS:
|
|
194
217
|
duration: The maximum duration of the audio in seconds.
|
195
218
|
chunk_time: How long each audio segment should be in seconds.
|
196
219
|
This should not need to be adjusted.
|
197
|
-
lookahead: The number of seconds to look ahead for each chunk.
|
198
|
-
This should not need to be adjusted.
|
199
220
|
voice: The voice to use for generating audio.
|
200
221
|
This can either be a voice id (string) or an embedding vector (List[float]).
|
201
222
|
stream: Whether to stream the audio or not.
|
@@ -206,18 +227,16 @@ class CartesiaTTS:
|
|
206
227
|
Returns:
|
207
228
|
A generator if `stream` is True, otherwise a dictionary.
|
208
229
|
Dictionary from both generator and non-generator return types have the following keys:
|
209
|
-
* "audio": The audio as a
|
230
|
+
* "audio": The audio as a bytes buffer.
|
210
231
|
* "sampling_rate": The sampling rate of the audio.
|
211
232
|
"""
|
212
|
-
|
233
|
+
self._check_inputs(transcript, duration, chunk_time)
|
213
234
|
|
214
|
-
|
215
|
-
voice = self._voices[voice]
|
235
|
+
body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
|
216
236
|
|
217
237
|
optional_body = dict(
|
218
238
|
duration=duration,
|
219
239
|
chunk_time=chunk_time,
|
220
|
-
lookahead=lookahead,
|
221
240
|
voice=voice,
|
222
241
|
)
|
223
242
|
body.update({k: v for k, v in optional_body.items() if v is not None})
|
@@ -237,7 +256,7 @@ class CartesiaTTS:
|
|
237
256
|
sampling_rate = chunk["sampling_rate"]
|
238
257
|
chunks.append(chunk["audio"])
|
239
258
|
|
240
|
-
return {"audio":
|
259
|
+
return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
|
241
260
|
|
242
261
|
def _generate_http(self, body: Dict[str, Any]):
|
243
262
|
response = requests.post(
|
@@ -258,8 +277,7 @@ class CartesiaTTS:
|
|
258
277
|
if start_index != -1 and end_index != -1:
|
259
278
|
try:
|
260
279
|
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
261
|
-
|
262
|
-
audio = np.frombuffer(data, dtype=np.float32)
|
280
|
+
audio = base64.b64decode(chunk_json["data"])
|
263
281
|
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
264
282
|
buffer = buffer[end_index + 1 :]
|
265
283
|
except json.JSONDecodeError:
|
@@ -268,28 +286,55 @@ class CartesiaTTS:
|
|
268
286
|
if buffer:
|
269
287
|
try:
|
270
288
|
chunk_json = json.loads(buffer)
|
271
|
-
|
272
|
-
audio = np.frombuffer(data, dtype=np.float32)
|
289
|
+
audio = base64.b64decode(chunk_json["data"])
|
273
290
|
yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
|
274
291
|
except json.JSONDecodeError:
|
275
292
|
pass
|
276
293
|
|
277
|
-
def _generate_ws(self, body: Dict[str, Any]):
|
294
|
+
def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
|
295
|
+
"""Generate audio using the websocket connection.
|
296
|
+
|
297
|
+
Args:
|
298
|
+
body: The request body.
|
299
|
+
context_id: The context id for the request.
|
300
|
+
The context id must be globally unique for the duration this client exists.
|
301
|
+
If this is provided, the context id that is in the response will
|
302
|
+
also be returned as part of the dict. This is helpful for testing.
|
303
|
+
"""
|
278
304
|
if not self.websocket or self._is_websocket_closed():
|
279
305
|
self.refresh_websocket()
|
280
306
|
|
281
|
-
|
307
|
+
include_context_id = bool(context_id)
|
308
|
+
if context_id is None:
|
309
|
+
context_id = uuid.uuid4().hex
|
310
|
+
self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
|
282
311
|
try:
|
283
|
-
|
284
|
-
while not response["done"]:
|
285
|
-
data = base64.b64decode(response["data"])
|
286
|
-
audio = np.frombuffer(data, dtype=np.float32)
|
287
|
-
# print("timing", time.perf_counter() - start)
|
288
|
-
yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
|
289
|
-
|
312
|
+
while True:
|
290
313
|
response = json.loads(self.websocket.recv())
|
291
|
-
|
292
|
-
|
314
|
+
if response["done"]:
|
315
|
+
break
|
316
|
+
audio = base64.b64decode(response["data"])
|
317
|
+
|
318
|
+
optional_kwargs = {}
|
319
|
+
if include_context_id:
|
320
|
+
optional_kwargs["context_id"] = response["context_id"]
|
321
|
+
|
322
|
+
yield {
|
323
|
+
"audio": audio,
|
324
|
+
"sampling_rate": response["sampling_rate"],
|
325
|
+
**optional_kwargs,
|
326
|
+
}
|
327
|
+
|
328
|
+
if self.experimental_ws_handle_interrupts:
|
329
|
+
self.websocket.send(json.dumps({"context_id": context_id}))
|
330
|
+
except GeneratorExit:
|
331
|
+
# The exit is only called when the generator is garbage collected.
|
332
|
+
# It may not be called directly after a break statement.
|
333
|
+
# However, the generator will be automatically cancelled on the next request.
|
334
|
+
if self.experimental_ws_handle_interrupts:
|
335
|
+
self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
|
336
|
+
except Exception as e:
|
337
|
+
raise RuntimeError(f"Failed to generate audio. {response}") from e
|
293
338
|
|
294
339
|
def _http_url(self):
|
295
340
|
prefix = "http" if "localhost" in self.base_url else "https"
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "0.0.4"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 0.0.2
|
3
|
+
Version: 0.0.4
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
|
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
Requires-Dist: websockets
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: numpy
|
16
15
|
Provides-Extra: dev
|
17
16
|
Requires-Dist: pre-commit; extra == "dev"
|
18
17
|
Requires-Dist: docformatter; extra == "dev"
|
@@ -49,22 +48,68 @@ pip install -e '.[dev]'
|
|
49
48
|
## Usage
|
50
49
|
```python
|
51
50
|
from cartesia.tts import CartesiaTTS
|
52
|
-
|
51
|
+
import pyaudio
|
52
|
+
import os
|
53
53
|
|
54
54
|
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
55
|
-
|
56
55
|
voices = client.get_voices()
|
57
|
-
|
56
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
58
57
|
transcript = "Hello! Welcome to Cartesia"
|
59
58
|
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
p = pyaudio.PyAudio()
|
60
|
+
|
61
|
+
stream = None
|
63
62
|
|
64
|
-
# Streaming
|
65
|
-
for output in client.generate(transcript=transcript, voice=embedding, stream=True):
|
66
|
-
|
63
|
+
# Generate and stream audio
|
64
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
65
|
+
buffer = output["audio"]
|
67
66
|
rate = output["sampling_rate"]
|
67
|
+
|
68
|
+
if not stream:
|
69
|
+
stream = p.open(format=pyaudio.paFloat32,
|
70
|
+
channels=1,
|
71
|
+
rate=rate,
|
72
|
+
output=True)
|
73
|
+
|
74
|
+
# Write the audio data to the stream
|
75
|
+
stream.write(buffer)
|
76
|
+
|
77
|
+
stream.stop_stream()
|
78
|
+
stream.close()
|
79
|
+
p.terminate()
|
80
|
+
```
|
81
|
+
|
82
|
+
If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
|
83
|
+
|
84
|
+
```python
|
85
|
+
from cartesia.tts import CartesiaTTS
|
86
|
+
from IPython.display import Audio
|
87
|
+
import io
|
88
|
+
import os
|
89
|
+
|
90
|
+
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
91
|
+
voices = client.get_voices()
|
92
|
+
voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
|
93
|
+
transcript = "Hello! Welcome to Cartesia"
|
94
|
+
|
95
|
+
# Create a BytesIO object to store the audio data
|
96
|
+
audio_data = io.BytesIO()
|
97
|
+
|
98
|
+
# Generate and stream audio
|
99
|
+
for output in client.generate(transcript=transcript, voice=voice, stream=True):
|
100
|
+
buffer = output["audio"]
|
101
|
+
audio_data.write(buffer)
|
102
|
+
|
103
|
+
# Set the cursor position to the beginning of the BytesIO object
|
104
|
+
audio_data.seek(0)
|
105
|
+
|
106
|
+
# Create an Audio object from the BytesIO data
|
107
|
+
audio = Audio(audio_data, rate=output["sampling_rate"])
|
108
|
+
|
109
|
+
# Display the Audio object
|
110
|
+
display(audio)
|
68
111
|
```
|
69
112
|
|
70
|
-
|
113
|
+
To avoid storing your API key in the source code, we recommend doing one of the following:
|
114
|
+
1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
|
115
|
+
1. Set the `CARTESIA_API_KEY` environment variable, preferably to a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`)
|
@@ -78,7 +78,8 @@ class UploadCommand(Command):
|
|
78
78
|
"""Support setup.py upload."""
|
79
79
|
|
80
80
|
description = "Build and publish the package."
|
81
|
-
user_options = []
|
81
|
+
user_options = [("skip-upload", "u", "skip git tagging and pypi upload")]
|
82
|
+
boolean_options = ["skip-upload"]
|
82
83
|
|
83
84
|
@staticmethod
|
84
85
|
def status(s):
|
@@ -86,21 +87,26 @@ class UploadCommand(Command):
|
|
86
87
|
print("\033[1m{0}\033[0m".format(s))
|
87
88
|
|
88
89
|
def initialize_options(self):
|
89
|
-
|
90
|
+
self.skip_upload = False
|
90
91
|
|
91
92
|
def finalize_options(self):
|
92
|
-
|
93
|
+
self.skip_upload = bool(self.skip_upload)
|
93
94
|
|
94
95
|
def run(self):
|
95
96
|
try:
|
96
97
|
self.status("Removing previous builds…")
|
97
98
|
rmtree(os.path.join(here, "dist"))
|
99
|
+
rmtree(os.path.join(here, "build"))
|
98
100
|
except OSError:
|
99
101
|
pass
|
100
102
|
|
101
103
|
self.status("Building Source and Wheel (universal) distribution…")
|
102
104
|
os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
|
103
105
|
|
106
|
+
if self.skip_upload:
|
107
|
+
self.status("Skipping git tagging and pypi upload")
|
108
|
+
sys.exit()
|
109
|
+
|
104
110
|
self.status("Uploading the package to PyPI via Twine…")
|
105
111
|
os.system("twine upload dist/*")
|
106
112
|
|
@@ -116,6 +122,9 @@ class BumpVersionCommand(Command):
|
|
116
122
|
To use: python setup.py bumpversion -v <version>
|
117
123
|
|
118
124
|
This command will push the new version directly and tag it.
|
125
|
+
|
126
|
+
Usage:
|
127
|
+
python setup.py bumpversion --version=1.0.1
|
119
128
|
"""
|
120
129
|
|
121
130
|
description = "Installs the foo."
|
@@ -130,6 +139,11 @@ class BumpVersionCommand(Command):
|
|
130
139
|
|
131
140
|
def initialize_options(self):
|
132
141
|
self.version = None
|
142
|
+
self.base_branch = None
|
143
|
+
self.version_branch = None
|
144
|
+
self.updated_files = [
|
145
|
+
"cartesia/version.py",
|
146
|
+
]
|
133
147
|
|
134
148
|
def finalize_options(self):
|
135
149
|
# This package cannot be imported at top level because it
|
@@ -147,14 +161,18 @@ class BumpVersionCommand(Command):
|
|
147
161
|
)
|
148
162
|
|
149
163
|
def _undo(self):
|
150
|
-
os.system(f"git restore --staged {
|
151
|
-
os.system(f"git checkout -- {
|
164
|
+
os.system(f"git restore --staged {' '.join(self.updated_files)}")
|
165
|
+
os.system(f"git checkout -- {' '.join(self.updated_files)}")
|
166
|
+
|
167
|
+
# Return to the original branch
|
168
|
+
os.system(f"git checkout {self.base_branch}")
|
169
|
+
os.system(f"git branch -D {self.version_branch}")
|
152
170
|
|
153
171
|
def run(self):
|
154
172
|
current_version = about["__version__"]
|
155
173
|
|
156
174
|
self.status("Checking current branch is 'main'")
|
157
|
-
current_branch = get_git_branch()
|
175
|
+
self.base_branch = current_branch = get_git_branch()
|
158
176
|
if current_branch != "main":
|
159
177
|
raise RuntimeError(
|
160
178
|
"You can only bump the version from the 'main' branch. "
|
@@ -174,18 +192,25 @@ class BumpVersionCommand(Command):
|
|
174
192
|
|
175
193
|
# TODO: Add check to see if all tests are passing on main.
|
176
194
|
|
195
|
+
# Checkout new branch
|
196
|
+
self.version_branch = f"bumpversion/v{self.version}"
|
197
|
+
self.status(f"Create branch '{self.version_branch}'")
|
198
|
+
err_code = os.system(f"git checkout -b {self.version_branch}")
|
199
|
+
if err_code != 0:
|
200
|
+
raise RuntimeError("Failed to create branch.")
|
201
|
+
|
177
202
|
# Change the version in __init__.py
|
178
203
|
self.status(f"Updating version {current_version} -> {self.version}")
|
179
204
|
update_version(self.version)
|
180
|
-
if current_version != self.version:
|
181
|
-
|
182
|
-
|
205
|
+
# if current_version != self.version:
|
206
|
+
# self._undo()
|
207
|
+
# raise RuntimeError("Failed to update version.")
|
183
208
|
|
184
|
-
self.status(f"Adding {
|
185
|
-
err_code = os.system(f"git add {
|
209
|
+
self.status(f"Adding {', '.join(self.updated_files)} to git")
|
210
|
+
err_code = os.system(f"git add {' '.join(self.updated_files)}")
|
186
211
|
if err_code != 0:
|
187
212
|
self._undo()
|
188
|
-
raise RuntimeError("Failed to add
|
213
|
+
raise RuntimeError("Failed to add files to git.")
|
189
214
|
|
190
215
|
# Commit the file with a message '[bumpversion] v<version>'.
|
191
216
|
self.status(f"Commit with message '[bumpversion] v{self.version}'")
|
@@ -195,12 +220,15 @@ class BumpVersionCommand(Command):
|
|
195
220
|
raise RuntimeError("Failed to commit file to git.")
|
196
221
|
|
197
222
|
# Push the commit to origin.
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
223
|
+
self.status(f"Pushing commit to origin/{self.version_branch}")
|
224
|
+
err_code = os.system(f"git push --force --set-upstream origin {self.version_branch}")
|
225
|
+
if err_code != 0:
|
226
|
+
# TODO: undo the commit automatically.
|
227
|
+
self._undo()
|
228
|
+
raise RuntimeError("Failed to push commit to origin.")
|
203
229
|
|
230
|
+
os.system(f"git checkout {self.base_branch}")
|
231
|
+
os.system(f"git branch -D {self.version_branch}")
|
204
232
|
sys.exit()
|
205
233
|
|
206
234
|
|
@@ -0,0 +1,180 @@
|
|
1
|
+
"""Test against the production Cartesia TTS API.
|
2
|
+
|
3
|
+
This test suite tries to be as general as possible because different keys
|
4
|
+
will lead to different results. Therefore, we cannot test for complete correctness
|
5
|
+
but rather for general correctness.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import os
|
9
|
+
import uuid
|
10
|
+
from typing import Dict, Generator, List
|
11
|
+
|
12
|
+
import pytest
|
13
|
+
|
14
|
+
from cartesia.tts import DEFAULT_MODEL_ID, CartesiaTTS, VoiceMetadata
|
15
|
+
|
16
|
+
SAMPLE_VOICE = "Milo"
|
17
|
+
|
18
|
+
|
19
|
+
class _Resources:
|
20
|
+
def __init__(self, *, client: CartesiaTTS, voices: Dict[str, VoiceMetadata]):
|
21
|
+
self.client = client
|
22
|
+
self.voices = voices
|
23
|
+
|
24
|
+
|
25
|
+
@pytest.fixture(scope="session")
|
26
|
+
def client():
|
27
|
+
return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
28
|
+
|
29
|
+
|
30
|
+
@pytest.fixture(scope="session")
|
31
|
+
def client_with_ws_interrupt():
|
32
|
+
return CartesiaTTS(
|
33
|
+
api_key=os.environ.get("CARTESIA_API_KEY"), experimental_ws_handle_interrupts=True
|
34
|
+
)
|
35
|
+
|
36
|
+
|
37
|
+
@pytest.fixture(scope="session")
|
38
|
+
def resources(client: CartesiaTTS):
|
39
|
+
voices = client.get_voices()
|
40
|
+
voice_id = voices[SAMPLE_VOICE]["id"]
|
41
|
+
voices[SAMPLE_VOICE]["embedding"] = client.get_voice_embedding(voice_id=voice_id)
|
42
|
+
|
43
|
+
return _Resources(
|
44
|
+
client=client,
|
45
|
+
voices=voices,
|
46
|
+
)
|
47
|
+
|
48
|
+
|
49
|
+
def test_get_voices(client: CartesiaTTS):
|
50
|
+
voices = client.get_voices()
|
51
|
+
|
52
|
+
assert isinstance(voices, dict)
|
53
|
+
assert all(isinstance(key, str) for key in voices.keys())
|
54
|
+
ids = [voice["id"] for voice in voices.values()]
|
55
|
+
assert len(ids) == len(set(ids)), "All ids must be unique"
|
56
|
+
assert all(
|
57
|
+
key == voice["name"] for key, voice in voices.items()
|
58
|
+
), "The key must be the same as the name"
|
59
|
+
|
60
|
+
|
61
|
+
def test_get_voice_embedding_from_id(client: CartesiaTTS):
|
62
|
+
voices = client.get_voices()
|
63
|
+
voice_id = voices[SAMPLE_VOICE]["id"]
|
64
|
+
|
65
|
+
client.get_voice_embedding(voice_id=voice_id)
|
66
|
+
|
67
|
+
|
68
|
+
def test_get_voice_embedding_from_url(client: CartesiaTTS):
|
69
|
+
url = "https://youtu.be/g2Z7Ddd573M?si=P8BM_hBqt5P8Ft6I&t=69"
|
70
|
+
_ = client.get_voice_embedding(link=url)
|
71
|
+
|
72
|
+
|
73
|
+
@pytest.mark.parametrize("websocket", [True, False])
|
74
|
+
def test_generate(resources: _Resources, websocket: bool):
|
75
|
+
client = resources.client
|
76
|
+
voices = resources.voices
|
77
|
+
embedding = voices[SAMPLE_VOICE]["embedding"]
|
78
|
+
transcript = "Hello, world!"
|
79
|
+
|
80
|
+
output = client.generate(transcript=transcript, voice=embedding, websocket=websocket)
|
81
|
+
assert output.keys() == {"audio", "sampling_rate"}
|
82
|
+
assert isinstance(output["audio"], bytes)
|
83
|
+
assert isinstance(output["sampling_rate"], int)
|
84
|
+
|
85
|
+
|
86
|
+
@pytest.mark.parametrize("websocket", [True, False])
|
87
|
+
def test_generate_stream(resources: _Resources, websocket: bool):
|
88
|
+
client = resources.client
|
89
|
+
voices = resources.voices
|
90
|
+
embedding = voices[SAMPLE_VOICE]["embedding"]
|
91
|
+
transcript = "Hello, world!"
|
92
|
+
|
93
|
+
generator = client.generate(
|
94
|
+
transcript=transcript, voice=embedding, websocket=websocket, stream=True
|
95
|
+
)
|
96
|
+
assert isinstance(generator, Generator)
|
97
|
+
|
98
|
+
for output in generator:
|
99
|
+
assert output.keys() == {"audio", "sampling_rate"}
|
100
|
+
assert isinstance(output["audio"], bytes)
|
101
|
+
assert isinstance(output["sampling_rate"], int)
|
102
|
+
|
103
|
+
|
104
|
+
@pytest.mark.parametrize(
|
105
|
+
"actions",
|
106
|
+
[
|
107
|
+
["cancel-5", None],
|
108
|
+
["cancel-5", "cancel-1", None],
|
109
|
+
[None, "cancel-3", None],
|
110
|
+
[None, "cancel-1", "cancel-2"],
|
111
|
+
],
|
112
|
+
)
|
113
|
+
def test_generate_stream_interrupt(
|
114
|
+
client_with_ws_interrupt: CartesiaTTS, resources: _Resources, actions: List[str]
|
115
|
+
):
|
116
|
+
client = client_with_ws_interrupt
|
117
|
+
voices = resources.voices
|
118
|
+
embedding = voices[SAMPLE_VOICE]["embedding"]
|
119
|
+
transcript = "Hello, world!"
|
120
|
+
|
121
|
+
context_ids = [f"test-{uuid.uuid4().hex[:6]}" for _ in range(len(actions))]
|
122
|
+
|
123
|
+
for context_id, action in zip(context_ids, actions):
|
124
|
+
body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=embedding)
|
125
|
+
|
126
|
+
# Parse actions to see what we should expect.
|
127
|
+
if action is None:
|
128
|
+
num_turns = None
|
129
|
+
elif "cancel" in action:
|
130
|
+
num_turns = int(action.split("-")[1])
|
131
|
+
|
132
|
+
generator = client._generate_ws(body, context_id=context_id)
|
133
|
+
for idx, response in enumerate(generator):
|
134
|
+
assert response.keys() == {"audio", "sampling_rate", "context_id"}
|
135
|
+
assert response["context_id"] == context_id, (
|
136
|
+
f"Context ID from response ({response['context_id']}) does not match "
|
137
|
+
f"the expected context ID ({context_id})"
|
138
|
+
)
|
139
|
+
if idx + 1 == num_turns:
|
140
|
+
break
|
141
|
+
|
142
|
+
|
143
|
+
@pytest.mark.parametrize("chunk_time", [0.05, 0.6])
|
144
|
+
def test_check_inputs_invalid_chunk_time(client: CartesiaTTS, chunk_time):
|
145
|
+
with pytest.raises(ValueError, match="`chunk_time` must be between 0.1 and 0.5"):
|
146
|
+
client._check_inputs("Test", None, chunk_time)
|
147
|
+
|
148
|
+
|
149
|
+
@pytest.mark.parametrize("chunk_time", [0.1, 0.3, 0.5])
|
150
|
+
def test_check_inputs_valid_chunk_time(client, chunk_time):
|
151
|
+
try:
|
152
|
+
client._check_inputs("Test", None, chunk_time)
|
153
|
+
except ValueError:
|
154
|
+
pytest.fail("Unexpected ValueError raised")
|
155
|
+
|
156
|
+
|
157
|
+
def test_check_inputs_duration_less_than_chunk_time(client: CartesiaTTS):
|
158
|
+
with pytest.raises(ValueError, match="`duration` must be greater than chunk_time"):
|
159
|
+
client._check_inputs("Test", 0.2, 0.3)
|
160
|
+
|
161
|
+
|
162
|
+
@pytest.mark.parametrize("duration,chunk_time", [(0.5, 0.2), (1.0, 0.5), (2.0, 0.1)])
|
163
|
+
def test_check_inputs_valid_duration_and_chunk_time(client: CartesiaTTS, duration, chunk_time):
|
164
|
+
try:
|
165
|
+
client._check_inputs("Test", duration, chunk_time)
|
166
|
+
except ValueError:
|
167
|
+
pytest.fail("Unexpected ValueError raised")
|
168
|
+
|
169
|
+
|
170
|
+
def test_check_inputs_empty_transcript(client: CartesiaTTS):
|
171
|
+
with pytest.raises(ValueError, match="`transcript` must be non empty"):
|
172
|
+
client._check_inputs("", None, None)
|
173
|
+
|
174
|
+
|
175
|
+
@pytest.mark.parametrize("transcript", ["Hello", "Test transcript", "Lorem ipsum dolor sit amet"])
|
176
|
+
def test_check_inputs_valid_transcript(client: CartesiaTTS, transcript):
|
177
|
+
try:
|
178
|
+
client._check_inputs(transcript, None, None)
|
179
|
+
except ValueError:
|
180
|
+
pytest.fail("Unexpected ValueError raised")
|
cartesia-0.0.2/README.md
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
# Cartesia Python API Library
|
2
|
-
The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
|
3
|
-
|
4
|
-
**Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
|
5
|
-
|
6
|
-
## Installation
|
7
|
-
```bash
|
8
|
-
pip install cartesia
|
9
|
-
|
10
|
-
# pip install in editable mode w/ dev dependencies
|
11
|
-
pip install -e '.[dev]'
|
12
|
-
```
|
13
|
-
|
14
|
-
## Usage
|
15
|
-
```python
|
16
|
-
from cartesia.tts import CartesiaTTS
|
17
|
-
from IPython.display import Audio
|
18
|
-
|
19
|
-
client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
20
|
-
|
21
|
-
voices = client.get_voices()
|
22
|
-
embedding = voices["Milo"]["embedding"]
|
23
|
-
transcript = "Hello! Welcome to Cartesia"
|
24
|
-
|
25
|
-
# No streaming
|
26
|
-
output = client.generate(transcript=transcript, voice=embedding)
|
27
|
-
Audio(output["audio"], rate=output["sampling_rate"])
|
28
|
-
|
29
|
-
# Streaming
|
30
|
-
for output in client.generate(transcript=transcript, voice=embedding, stream=True):
|
31
|
-
arr = output["audio"] # a numpy array
|
32
|
-
rate = output["sampling_rate"]
|
33
|
-
```
|
34
|
-
|
35
|
-
We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = "0.0.2"
|
cartesia-0.0.2/tests/test_tts.py
DELETED
@@ -1,96 +0,0 @@
|
|
1
|
-
"""Test against the production Cartesia TTS API.
|
2
|
-
|
3
|
-
This test suite tries to be as general as possible because different keys
|
4
|
-
will lead to different results. Therefore, we cannot test for complete correctness
|
5
|
-
but rather for general correctness.
|
6
|
-
"""
|
7
|
-
|
8
|
-
import os
|
9
|
-
from typing import Dict, Generator
|
10
|
-
|
11
|
-
import numpy as np
|
12
|
-
import pytest
|
13
|
-
|
14
|
-
from cartesia.tts import CartesiaTTS, VoiceMetadata
|
15
|
-
|
16
|
-
SAMPLE_VOICE = "Milo"
|
17
|
-
|
18
|
-
|
19
|
-
class _Resources:
|
20
|
-
def __init__(self, *, client: CartesiaTTS, voices: Dict[str, VoiceMetadata]):
|
21
|
-
self.client = client
|
22
|
-
self.voices = voices
|
23
|
-
|
24
|
-
|
25
|
-
@pytest.fixture(scope="session")
|
26
|
-
def client():
|
27
|
-
return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
|
28
|
-
|
29
|
-
|
30
|
-
@pytest.fixture(scope="session")
|
31
|
-
def resources(client: CartesiaTTS):
|
32
|
-
voices = client.get_voices()
|
33
|
-
voice_id = voices[SAMPLE_VOICE]["id"]
|
34
|
-
voices[SAMPLE_VOICE]["embedding"] = client.get_voice_embedding(voice_id=voice_id)
|
35
|
-
|
36
|
-
return _Resources(
|
37
|
-
client=client,
|
38
|
-
voices=voices,
|
39
|
-
)
|
40
|
-
|
41
|
-
|
42
|
-
def test_get_voices(client: CartesiaTTS):
|
43
|
-
voices = client.get_voices()
|
44
|
-
|
45
|
-
assert isinstance(voices, dict)
|
46
|
-
assert all(isinstance(key, str) for key in voices.keys())
|
47
|
-
ids = [voice["id"] for voice in voices.values()]
|
48
|
-
assert len(ids) == len(set(ids)), "All ids must be unique"
|
49
|
-
assert all(
|
50
|
-
key == voice["name"] for key, voice in voices.items()
|
51
|
-
), "The key must be the same as the name"
|
52
|
-
|
53
|
-
|
54
|
-
def test_get_voice_embedding_from_id(client: CartesiaTTS):
|
55
|
-
voices = client.get_voices()
|
56
|
-
voice_id = voices[SAMPLE_VOICE]["id"]
|
57
|
-
|
58
|
-
client.get_voice_embedding(voice_id=voice_id)
|
59
|
-
|
60
|
-
|
61
|
-
def test_get_voice_embedding_from_url(client: CartesiaTTS):
|
62
|
-
url = "https://youtu.be/g2Z7Ddd573M?si=P8BM_hBqt5P8Ft6I&t=69"
|
63
|
-
_ = client.get_voice_embedding(link=url)
|
64
|
-
|
65
|
-
|
66
|
-
@pytest.mark.parametrize("websocket", [True, False])
|
67
|
-
def test_generate(resources: _Resources, websocket: bool):
|
68
|
-
client = resources.client
|
69
|
-
voices = resources.voices
|
70
|
-
embedding = voices[SAMPLE_VOICE]["embedding"]
|
71
|
-
transcript = "Hello, world!"
|
72
|
-
|
73
|
-
output = client.generate(transcript=transcript, voice=embedding, websocket=websocket)
|
74
|
-
assert output.keys() == {"audio", "sampling_rate"}
|
75
|
-
assert isinstance(output["audio"], np.ndarray)
|
76
|
-
assert output["audio"].dtype == np.float32
|
77
|
-
assert isinstance(output["sampling_rate"], int)
|
78
|
-
|
79
|
-
|
80
|
-
@pytest.mark.parametrize("websocket", [True, False])
|
81
|
-
def test_generate_stream(resources: _Resources, websocket: bool):
|
82
|
-
client = resources.client
|
83
|
-
voices = resources.voices
|
84
|
-
embedding = voices[SAMPLE_VOICE]["embedding"]
|
85
|
-
transcript = "Hello, world!"
|
86
|
-
|
87
|
-
generator = client.generate(
|
88
|
-
transcript=transcript, voice=embedding, websocket=websocket, stream=True
|
89
|
-
)
|
90
|
-
assert isinstance(generator, Generator)
|
91
|
-
|
92
|
-
for output in generator:
|
93
|
-
assert output.keys() == {"audio", "sampling_rate"}
|
94
|
-
assert isinstance(output["audio"], np.ndarray)
|
95
|
-
assert output["audio"].dtype == np.float32
|
96
|
-
assert isinstance(output["sampling_rate"], int)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|