cartesia 0.0.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +3 -0
- cartesia/tts.py +304 -0
- cartesia/version.py +1 -0
- cartesia-0.0.2.dist-info/METADATA +70 -0
- cartesia-0.0.2.dist-info/RECORD +7 -0
- cartesia-0.0.2.dist-info/WHEEL +6 -0
- cartesia-0.0.2.dist-info/top_level.txt +1 -0
cartesia/__init__.py
ADDED
cartesia/tts.py
ADDED
@@ -0,0 +1,304 @@
import base64
import json
import os
import uuid
from typing import Any, Dict, Generator, List, Optional, TypedDict, Union

import numpy as np
import requests
from websockets.sync.client import connect

DEFAULT_MODEL_ID = "genial-planet-1346"
DEFAULT_BASE_URL = "api.cartesia.ai"
DEFAULT_API_VERSION = "v0"


class AudioOutput(TypedDict):
    audio: np.ndarray
    sampling_rate: int


Embedding = List[float]


class VoiceMetadata(TypedDict):
    id: str
    name: str
    description: str
    embedding: Optional[Embedding]


class CartesiaTTS:
    """The client for Cartesia's text-to-speech library.

    This client contains methods to interact with the Cartesia text-to-speech API.
    The API offers methods for listing available voices, fetching voice embeddings,
    and generating audio from a transcript.

    Examples:

        >>> client = CartesiaTTS()

        # Load available voices and their metadata (excluding the embeddings).
        # Embeddings are fetched with `get_voice_embedding`. This avoids preloading
        # all of the embeddings, which can be expensive if there are a lot of voices.
        >>> voices = client.get_voices()
        >>> embedding = client.get_voice_embedding(voice_id=voices["Milo"]["id"])
        >>> audio = client.generate(transcript="Hello world!", voice=embedding)

        # Preload all available voices and their embeddings if you plan on reusing
        # all of the embeddings often.
        >>> voices = client.get_voices(skip_embeddings=False)
        >>> embedding = voices["Milo"]["embedding"]
        >>> audio = client.generate(transcript="Hello world!", voice=embedding)

        # Generate audio stream
        >>> for audio_chunk in client.generate(transcript="Hello world!", voice=embedding, stream=True):
        ...     audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
    """

    def __init__(self, *, api_key: str = None):
        """
        Args:
            api_key: The API key to use for authorization.
                If not specified, the API key will be read from the environment variable
                `CARTESIA_API_KEY`.
        """
        self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
        self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
        self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
        self.websocket = None
        self.refresh_websocket()

    def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
        """Returns a mapping from voice name -> voice metadata.

        Args:
            skip_embeddings: Whether to skip returning the embeddings.
                It is recommended to skip if you only want to see which voices are
                available, since loading embeddings for all your voices can be expensive.
                You can then use ``get_voice_embedding`` to get the embeddings for the
                voices you are interested in.

        Returns:
            A mapping from voice name -> voice metadata.

        Note:
            If the voice name is not unique, there is undefined behavior as to which
            voice will correspond to the name. To be more thorough, look at the web
            client to find the `voice_id` for the voice you are looking for.

        Usage:
            >>> client = CartesiaTTS()
            >>> voices = client.get_voices()
            >>> voices
            {
                "Jane": {
                    "id": "c1d1d3a8-6f4e-4b3f-8b3e-2e1b3e1b3e1b",
                    "name": "Jane",
                }
            }
            >>> embedding = client.get_voice_embedding(voice_id=voices["Jane"]["id"])
            >>> audio = client.generate(transcript="Hello world!", voice=embedding)
        """
        params = {"select": "id, name, description"} if skip_embeddings else None
        response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)

        if response.status_code != 200:
            raise ValueError(f"Failed to get voices. Error: {response.text}")

        voices = response.json()
        # TODO: Update the API to return the embedding as a list of floats rather than string.
        if not skip_embeddings:
            for voice in voices:
                voice["embedding"] = json.loads(voice["embedding"])
        return {voice["name"]: voice for voice in voices}

    def get_voice_embedding(
        self, *, voice_id: str = None, filepath: str = None, link: str = None
    ) -> Embedding:
        """Get a voice embedding from a voice_id, a filepath, or a YouTube url.

        Args:
            voice_id: The voice id.
            filepath: Path to the audio file from which to get the audio.
            link: The url to get the audio from. Currently only supports YouTube shared urls.

        Returns:
            The voice embedding.

        Raises:
            ValueError: If more than one of `voice_id`, `filepath` or `link` is specified.
                Only one should be specified.
        """
        if sum(bool(x) for x in (voice_id, filepath, link)) != 1:
            raise ValueError("Exactly one of `voice_id`, `filepath` or `link` should be specified.")

        if voice_id:
            url = f"{self._http_url()}/voices/embedding/{voice_id}"
            response = requests.get(url, headers=self.headers)
        elif filepath:
            url = f"{self._http_url()}/voices/clone/clip"
            files = {"clip": open(filepath, "rb")}
            headers = self.headers.copy()
            # The default content type of JSON is incorrect for file uploads
            headers.pop("Content-Type")
            response = requests.post(url, headers=headers, files=files)
        elif link:
            url = f"{self._http_url()}/voices/clone/url"
            params = {"link": link}
            response = requests.post(url, headers=self.headers, params=params)

        if response.status_code != 200:
            raise ValueError(
                f"Failed to clone voice. Status Code: {response.status_code}\n"
                f"Error: {response.text}"
            )

        # Handle successful response
        out = response.json()
        if isinstance(out["embedding"], str):
            out["embedding"] = json.loads(out["embedding"])
        return out["embedding"]

    def refresh_websocket(self):
        """Refresh the websocket connection.

        Note:
            The connection is synchronous.
        """
        if self.websocket and not self._is_websocket_closed():
            self.websocket.close()
        self.websocket = connect(
            f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
            close_timeout=None,
        )

    def _is_websocket_closed(self):
        return self.websocket.socket.fileno() == -1

    def generate(
        self,
        *,
        transcript: str,
        duration: int = None,
        chunk_time: float = None,
        lookahead: int = None,
        voice: Embedding = None,
        stream: bool = False,
        websocket: bool = True,
    ) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
        """Generate audio from a transcript.

        Args:
            transcript: The text to generate audio for.
            duration: The maximum duration of the audio in seconds.
            chunk_time: How long each audio segment should be in seconds.
                This should not need to be adjusted.
            lookahead: The number of seconds to look ahead for each chunk.
                This should not need to be adjusted.
            voice: The voice to use for generating audio.
                This can either be a voice id (string) or an embedding vector (List[float]).
            stream: Whether to stream the audio or not.
                If ``True`` this function returns a generator.
            websocket: Whether to use a websocket for streaming audio.
                Using the websocket reduces latency by performing the connection handshake ahead of time.

        Returns:
            A generator if `stream` is True, otherwise a dictionary.
            Both the streamed chunks and the non-streaming return value are dictionaries with the following keys:
                * "audio": The audio as a 1D numpy array.
                * "sampling_rate": The sampling rate of the audio.
        """
        body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)

        if isinstance(voice, str):
            voice = self._voices[voice]

        optional_body = dict(
            duration=duration,
            chunk_time=chunk_time,
            lookahead=lookahead,
            voice=voice,
        )
        body.update({k: v for k, v in optional_body.items() if v is not None})

        if websocket:
            generator = self._generate_ws(body)
        else:
            generator = self._generate_http(body)

        if stream:
            return generator

        chunks = []
        sampling_rate = None
        for chunk in generator:
            if sampling_rate is None:
                sampling_rate = chunk["sampling_rate"]
            chunks.append(chunk["audio"])

        return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}

    def _generate_http(self, body: Dict[str, Any]):
        response = requests.post(
            f"{self._http_url()}/audio/stream",
            stream=True,
            data=json.dumps(body),
            headers=self.headers,
        )
        if response.status_code != 200:
            raise ValueError(f"Failed to generate audio. {response.text}")

        buffer = ""
        for chunk_bytes in response.iter_content(chunk_size=None):
            buffer += chunk_bytes.decode("utf-8")
            while "{" in buffer and "}" in buffer:
                start_index = buffer.find("{")
                end_index = buffer.find("}", start_index)
                if start_index != -1 and end_index != -1:
                    try:
                        chunk_json = json.loads(buffer[start_index : end_index + 1])
                        data = base64.b64decode(chunk_json["data"])
                        audio = np.frombuffer(data, dtype=np.float32)
                        yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
                        buffer = buffer[end_index + 1 :]
                    except json.JSONDecodeError:
                        break

        if buffer:
            try:
                chunk_json = json.loads(buffer)
                data = base64.b64decode(chunk_json["data"])
                audio = np.frombuffer(data, dtype=np.float32)
                yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
            except json.JSONDecodeError:
                pass

    def _generate_ws(self, body: Dict[str, Any]):
        if not self.websocket or self._is_websocket_closed():
            self.refresh_websocket()

        self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
        try:
            response = json.loads(self.websocket.recv())
            while not response["done"]:
                data = base64.b64decode(response["data"])
                audio = np.frombuffer(data, dtype=np.float32)
                yield {"audio": audio, "sampling_rate": response["sampling_rate"]}

                response = json.loads(self.websocket.recv())
        except Exception:
            raise RuntimeError(f"Failed to generate audio. {response}")

    def _http_url(self):
        prefix = "http" if "localhost" in self.base_url else "https"
        return f"{prefix}://{self.base_url}/{self.api_version}"

    def _ws_url(self):
        prefix = "ws" if "localhost" in self.base_url else "wss"
        return f"{prefix}://{self.base_url}/{self.api_version}"

    def __del__(self):
        if self.websocket.socket.fileno() > -1:
            self.websocket.close()
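
For readers skimming this diff, the streaming contract above (`generate(..., stream=True)` yields dictionaries with a float32 `"audio"` array and a `"sampling_rate"`) can be exercised with a short, hypothetical consumer like the sketch below. It is not part of the package: the voice id is a placeholder, and it assumes the chunks are mono samples in [-1, 1] so they can be converted to 16-bit PCM for Python's standard `wave` module.

```python
# Hypothetical consumer of the streaming API above; not part of this package.
import wave

import numpy as np

from cartesia.tts import CartesiaTTS

client = CartesiaTTS()  # falls back to the CARTESIA_API_KEY environment variable
embedding = client.get_voice_embedding(voice_id="your-voice-id")  # placeholder id

with wave.open("hello.wav", "wb") as wav:
    wav.setnchannels(1)  # assumption: chunks are mono
    wav.setsampwidth(2)  # we write 16-bit samples below
    first_chunk = True
    for chunk in client.generate(transcript="Hello world!", voice=embedding, stream=True):
        if first_chunk:
            wav.setframerate(chunk["sampling_rate"])
            first_chunk = False
        # assumption: float32 samples in [-1, 1]; convert to int16 for the wave module
        pcm16 = (np.clip(chunk["audio"], -1.0, 1.0) * 32767).astype(np.int16)
        wav.writeframes(pcm16.tobytes())
```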
cartesia/version.py
ADDED
@@ -0,0 +1 @@
__version__ = "0.0.2"
cartesia-0.0.2.dist-info/METADATA
ADDED
@@ -0,0 +1,70 @@
Metadata-Version: 2.1
Name: cartesia
Version: 0.0.2
Summary: The official Python library for the Cartesia API.
Home-page: 
Author: Cartesia, Inc.
Author-email: support@cartesia.ai
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.8.0
Description-Content-Type: text/markdown
Requires-Dist: websockets
Requires-Dist: requests
Requires-Dist: numpy
Provides-Extra: all
Requires-Dist: pre-commit ; extra == 'all'
Requires-Dist: docformatter ; extra == 'all'
Requires-Dist: black ==24.1.1 ; extra == 'all'
Requires-Dist: isort ==5.13.2 ; extra == 'all'
Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
Requires-Dist: pytest >=8.0.2 ; extra == 'all'
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
Provides-Extra: dev
Requires-Dist: pre-commit ; extra == 'dev'
Requires-Dist: docformatter ; extra == 'dev'
Requires-Dist: black ==24.1.1 ; extra == 'dev'
Requires-Dist: isort ==5.13.2 ; extra == 'dev'
Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'

# Cartesia Python API Library
The official Cartesia Python library, which provides convenient access to the Cartesia REST and Websocket APIs from any Python 3.8+ application.

**Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.

## Installation
```bash
pip install cartesia

# pip install in editable mode w/ dev dependencies
pip install -e '.[dev]'
```

## Usage
```python
import os

from cartesia.tts import CartesiaTTS
from IPython.display import Audio

client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))

voices = client.get_voices(skip_embeddings=False)
embedding = voices["Milo"]["embedding"]
transcript = "Hello! Welcome to Cartesia"

# No streaming
output = client.generate(transcript=transcript, voice=embedding)
Audio(output["audio"], rate=output["sampling_rate"])

# Streaming
for output in client.generate(transcript=transcript, voice=embedding, stream=True):
    arr = output["audio"]  # a numpy array
    rate = output["sampling_rate"]
```

We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API key is not stored in the source code.
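
As a companion to the `python-dotenv` recommendation above, a minimal sketch of that setup could look like the following (the `.env` filename and the `load_dotenv` call come from python-dotenv, not from this package):

```python
# Minimal sketch: load CARTESIA_API_KEY from a local .env file instead of hardcoding it.
from dotenv import load_dotenv  # pip install python-dotenv

from cartesia.tts import CartesiaTTS

load_dotenv()  # reads .env, which contains CARTESIA_API_KEY="my-api-key"

# With no api_key argument, the client reads CARTESIA_API_KEY from the environment.
client = CartesiaTTS()
voices = client.get_voices()  # names and metadata only; embeddings skipped by default
print(sorted(voices))
```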
cartesia-0.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
cartesia/tts.py,sha256=kQjkQhWfVrLFH6yaOb6G65HXtFDzPwLa6Q9AwVgIyCI,11901
cartesia/version.py,sha256=QvlVh4JTl3JL7jQAja76yKtT-IvF4631ASjWY1wS6AQ,22
cartesia-0.0.2.dist-info/METADATA,sha256=7BcDRyB4vxCWxcJhTbe_cWEQXCNOiEJdqoWh8WouNGs,2465
cartesia-0.0.2.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
cartesia-0.0.2.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
cartesia-0.0.2.dist-info/RECORD,,
cartesia-0.0.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
cartesia