cartesia 0.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from cartesia.tts import CartesiaTTS
2
+
3
+ __all__ = ["CartesiaTTS"]
cartesia/tts.py ADDED
@@ -0,0 +1,304 @@
1
+ import base64
2
+ import json
3
+ import os
4
+ import uuid
5
+ from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
+
7
+ import numpy as np
8
+ import requests
9
+ from websockets.sync.client import connect
10
+
11
# Model id sent with every generation request (see ``CartesiaTTS.generate``).
DEFAULT_MODEL_ID = "genial-planet-1346"
# API host without a scheme; the scheme is added by ``_http_url`` / ``_ws_url``.
DEFAULT_BASE_URL = "api.cartesia.ai"
# API version used as the first path segment of every endpoint.
DEFAULT_API_VERSION = "v0"


class AudioOutput(TypedDict):
    """A piece of generated audio together with its sampling rate."""

    audio: np.ndarray
    sampling_rate: int


Embedding = List[float]


class VoiceMetadata(TypedDict):
    """Metadata describing one voice, as returned by ``CartesiaTTS.get_voices``."""

    id: str
    name: str
    description: str
    embedding: Optional[Embedding]


class CartesiaTTS:
    """The client for Cartesia's text-to-speech library.

    This client contains methods to interact with the Cartesia text-to-speech API.
    The API offers voice listing, voice embedding lookup and cloning, and audio
    generation (optionally streamed, over HTTP or a websocket).

    Examples:

        >>> client = CartesiaTTS()

        # Load available voices and their metadata (excluding the embeddings).
        # Embeddings are fetched with `get_voice_embedding`. This avoids preloading
        # all of the embeddings, which can be expensive if there are a lot of voices.
        >>> voices = client.get_voices()
        >>> embedding = client.get_voice_embedding(voice_id=voices["Milo"]["id"])
        >>> audio = client.generate(transcript="Hello world!", voice=embedding)

        # Preload all available voices and their embeddings if you plan on reusing
        # all of the embeddings often.
        >>> voices = client.get_voices(skip_embeddings=False)
        >>> embedding = voices["Milo"]["embedding"]
        >>> audio = client.generate(transcript="Hello world!", voice=embedding)

        # Generate audio stream
        >>> for audio_chunk in client.generate(transcript="Hello world!", voice=embedding, stream=True):
        ...     audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
    """

    def __init__(self, *, api_key: Optional[str] = None):
        """
        Args:
            api_key: The API key to use for authorization.
                If not specified, the API key will be read from the environment variable
                `CARTESIA_API_KEY`.
        """
        # Assigned first so that ``__del__`` stays safe if anything below raises.
        self.websocket = None
        self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
        self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
        self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
        self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
        # Connect eagerly so the handshake is already done when `generate` is called.
        self.refresh_websocket()

    def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
        """Returns a mapping from voice name -> voice metadata.

        Args:
            skip_embeddings: Whether to skip returning the embeddings.
                It is recommended to skip if you only want to see what
                voices are available, since loading embeddings for all your voices can be
                expensive. You can then use ``get_voice_embedding`` to get the embeddings
                for the voices you are interested in.

        Returns:
            A mapping from voice name -> voice metadata.

        Raises:
            ValueError: If the API responds with a status code other than 200.

        Note:
            If the voice name is not unique, there is undefined behavior as to which
            voice will correspond to the name. To be more thorough, look at the web
            client to find the `voice_id` for the voice you are looking for.

        Usage:
            >>> client = CartesiaTTS()
            >>> voices = client.get_voices()
            >>> voices
            {
                "Jane": {
                    "id": "c1d1d3a8-6f4e-4b3f-8b3e-2e1b3e1b3e1b",
                    "name": "Jane",
                }
            }
            >>> embedding = client.get_voice_embedding(voice_id=voices["Jane"]["id"])
            >>> audio = client.generate(transcript="Hello world!", voice=embedding)
        """
        params = {"select": "id, name, description"} if skip_embeddings else None
        response = requests.get(f"{self._http_url()}/voices", headers=self.headers, params=params)

        if response.status_code != 200:
            raise ValueError(f"Failed to get voices. Error: {response.text}")

        voices = response.json()
        # TODO: Update the API to return the embedding as a list of floats rather than string.
        if not skip_embeddings:
            for voice in voices:
                # Only decode string payloads, mirroring `get_voice_embedding`.
                if isinstance(voice.get("embedding"), str):
                    voice["embedding"] = json.loads(voice["embedding"])
        return {voice["name"]: voice for voice in voices}

    def get_voice_embedding(
        self,
        *,
        voice_id: Optional[str] = None,
        filepath: Optional[str] = None,
        link: Optional[str] = None,
    ) -> Embedding:
        """Get a voice embedding from voice_id, a filepath or YouTube url.

        Args:
            voice_id: The voice id.
            filepath: Path to audio file from which to get the audio.
            link: The url to get the audio from. Currently only supports youtube shared urls.

        Returns:
            The voice embedding.

        Raises:
            ValueError: If none or more than one of `voice_id`, `filepath` or `link` is
                specified (exactly one is required), or if the API responds with a
                status code other than 200.
        """
        if sum(bool(x) for x in (voice_id, filepath, link)) != 1:
            raise ValueError("Exactly one of `voice_id`, `filepath` or `link` should be specified.")

        if voice_id:
            url = f"{self._http_url()}/voices/embedding/{voice_id}"
            response = requests.get(url, headers=self.headers)
        elif filepath:
            url = f"{self._http_url()}/voices/clone/clip"
            # The default content type of JSON is incorrect for file uploads
            headers = {k: v for k, v in self.headers.items() if k != "Content-Type"}
            # The context manager closes the clip even if the upload fails.
            with open(filepath, "rb") as clip:
                response = requests.post(url, headers=headers, files={"clip": clip})
        else:
            url = f"{self._http_url()}/voices/clone/url"
            response = requests.post(url, headers=self.headers, params={"link": link})

        if response.status_code != 200:
            raise ValueError(
                f"Failed to clone voice. Status Code: {response.status_code}\n"
                f"Error: {response.text}"
            )

        # Handle successful response
        embedding = response.json()["embedding"]
        if isinstance(embedding, str):
            embedding = json.loads(embedding)
        return embedding

    def refresh_websocket(self):
        """Refresh the websocket connection.

        Closes the current connection if it is still open, then opens a new one.

        Note:
            The connection is synchronous.
        """
        if self.websocket and not self._is_websocket_closed():
            self.websocket.close()
        self.websocket = connect(
            f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
            close_timeout=None,
        )

    def _is_websocket_closed(self):
        """Return True when the websocket's underlying socket reports fileno -1."""
        return self.websocket.socket.fileno() == -1

    def generate(
        self,
        *,
        transcript: str,
        duration: Optional[int] = None,
        chunk_time: Optional[float] = None,
        lookahead: Optional[int] = None,
        voice: Optional[Union[str, Embedding]] = None,
        stream: bool = False,
        websocket: bool = True,
    ) -> Union[AudioOutput, Generator[AudioOutput, None, None]]:
        """Generate audio from a transcript.

        Args:
            transcript: The text to generate audio for.
            duration: The maximum duration of the audio in seconds.
            chunk_time: How long each audio segment should be in seconds.
                This should not need to be adjusted.
            lookahead: The number of seconds to look ahead for each chunk.
                This should not need to be adjusted.
            voice: The voice to use for generating audio.
                This can either be a voice id (string) or an embedding vector (List[float]).
                A voice id is resolved to its embedding with ``get_voice_embedding``.
            stream: Whether to stream the audio or not.
                If ``True`` this function returns a generator.
            websocket: Whether to use a websocket for streaming audio.
                Using the websocket reduces latency because the handshake has
                already been performed ahead of time.

        Returns:
            A generator if `stream` is True, otherwise a dictionary.
            Dictionary from both generator and non-generator return types have the following keys:
                * "audio": The audio as a 1D numpy array.
                * "sampling_rate": The sampling rate of the audio.

        Raises:
            ValueError: If `stream` is False and no audio chunks were produced.
        """
        body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)

        if isinstance(voice, str):
            # A string is a voice id: fetch the embedding the API expects.
            voice = self.get_voice_embedding(voice_id=voice)

        optional_body = dict(
            duration=duration,
            chunk_time=chunk_time,
            lookahead=lookahead,
            voice=voice,
        )
        # Only send the optional fields the caller actually provided.
        body.update({k: v for k, v in optional_body.items() if v is not None})

        generator = self._generate_ws(body) if websocket else self._generate_http(body)
        if stream:
            return generator

        chunks = []
        sampling_rate = None
        for chunk in generator:
            if sampling_rate is None:
                sampling_rate = chunk["sampling_rate"]
            chunks.append(chunk["audio"])

        if not chunks:
            # np.concatenate([]) would fail with an unrelated-looking message.
            raise ValueError("Failed to generate audio. No audio chunks were returned.")

        return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}

    @staticmethod
    def _decode_audio(payload: Dict[str, Any]) -> AudioOutput:
        """Turn one API message (base64 ``data`` + ``sampling_rate``) into an AudioOutput."""
        raw = base64.b64decode(payload["data"])
        return {
            "audio": np.frombuffer(raw, dtype=np.float32),
            "sampling_rate": payload["sampling_rate"],
        }

    @staticmethod
    def _iter_json_objects(text_chunks) -> Generator[Dict[str, Any], None, None]:
        """Yield each complete JSON object found in a stream of text fragments.

        Objects may be split across fragments or packed back to back; text that
        precedes an object's opening brace is discarded once the object is parsed.
        """
        decoder = json.JSONDecoder()
        buffer = ""
        for text in text_chunks:
            buffer += text
            while True:
                start = buffer.find("{")
                if start == -1:
                    break
                try:
                    # raw_decode parses one document and reports where it ended, so
                    # nested objects and braces inside strings are handled correctly.
                    obj, end = decoder.raw_decode(buffer[start:])
                except json.JSONDecodeError:
                    # Most likely an incomplete object: wait for more data.
                    break
                yield obj
                buffer = buffer[start + end :]

    def _generate_http(self, body: Dict[str, Any]):
        """Stream audio over HTTP, yielding one AudioOutput per JSON message.

        Raises:
            ValueError: If the API responds with a status code other than 200.
        """
        response = requests.post(
            f"{self._http_url()}/audio/stream",
            stream=True,
            data=json.dumps(body),
            headers=self.headers,
        )
        try:
            if response.status_code != 200:
                raise ValueError(f"Failed to generate audio. {response.text}")

            text_chunks = (
                chunk_bytes.decode("utf-8")
                for chunk_bytes in response.iter_content(chunk_size=None)
            )
            for payload in self._iter_json_objects(text_chunks):
                yield self._decode_audio(payload)
        finally:
            # Release the streamed connection even if the consumer stops early.
            response.close()

    def _generate_ws(self, body: Dict[str, Any]):
        """Stream audio over the websocket, yielding AudioOutput until a "done" message.

        Raises:
            RuntimeError: If receiving or decoding a message fails; the last
                received message (if any) is included in the error text.
        """
        if not self.websocket or self._is_websocket_closed():
            self.refresh_websocket()

        self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
        # Bound up front so the error message below works even if the first recv fails.
        response = None
        try:
            response = json.loads(self.websocket.recv())
            while not response["done"]:
                yield self._decode_audio(response)
                response = json.loads(self.websocket.recv())
        except Exception as e:
            raise RuntimeError(f"Failed to generate audio. {response}") from e

    def _http_url(self):
        """Return the HTTP base URL; plain http is used only for localhost hosts."""
        prefix = "http" if "localhost" in self.base_url else "https"
        return f"{prefix}://{self.base_url}/{self.api_version}"

    def _ws_url(self):
        """Return the websocket base URL; plain ws is used only for localhost hosts."""
        prefix = "ws" if "localhost" in self.base_url else "wss"
        return f"{prefix}://{self.base_url}/{self.api_version}"

    def __del__(self):
        """Close the websocket, if one is open, when the client is garbage collected."""
        websocket = getattr(self, "websocket", None)
        if websocket is None:
            # The connection was never established (e.g. `__init__` failed early).
            return
        try:
            if websocket.socket.fileno() > -1:
                websocket.close()
        except Exception:
            # Best-effort cleanup: a finalizer must never raise.
            pass
cartesia/version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.0.2"
@@ -0,0 +1,70 @@
1
+ Metadata-Version: 2.1
2
+ Name: cartesia
3
+ Version: 0.0.2
4
+ Summary: The official Python library for the Cartesia API.
5
+ Home-page:
6
+ Author: Cartesia, Inc.
7
+ Author-email: support@cartesia.ai
8
+ Classifier: Programming Language :: Python
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Requires-Python: >=3.8.0
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: websockets
14
+ Requires-Dist: requests
15
+ Requires-Dist: numpy
16
+ Provides-Extra: all
17
+ Requires-Dist: pre-commit ; extra == 'all'
18
+ Requires-Dist: docformatter ; extra == 'all'
19
+ Requires-Dist: black ==24.1.1 ; extra == 'all'
20
+ Requires-Dist: isort ==5.13.2 ; extra == 'all'
21
+ Requires-Dist: flake8 ==7.0.0 ; extra == 'all'
22
+ Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'all'
23
+ Requires-Dist: pytest >=8.0.2 ; extra == 'all'
24
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'all'
25
+ Provides-Extra: dev
26
+ Requires-Dist: pre-commit ; extra == 'dev'
27
+ Requires-Dist: docformatter ; extra == 'dev'
28
+ Requires-Dist: black ==24.1.1 ; extra == 'dev'
29
+ Requires-Dist: isort ==5.13.2 ; extra == 'dev'
30
+ Requires-Dist: flake8 ==7.0.0 ; extra == 'dev'
31
+ Requires-Dist: flake8-bugbear ==24.2.6 ; extra == 'dev'
32
+ Requires-Dist: pytest >=8.0.2 ; extra == 'dev'
33
+ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'dev'
34
+
35
+
36
+ # Cartesia Python API Library
37
+ The official Cartesia Python library, which provides convenient access to the Cartesia REST and WebSocket APIs from any Python 3.8+ application.
38
+
39
+ **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
40
+
41
+ ## Installation
42
+ ```bash
43
+ pip install cartesia
44
+
45
+ # pip install in editable mode w/ dev dependencies
46
+ pip install -e '.[dev]'
47
+ ```
48
+
49
+ ## Usage
50
+ ```python
51
+ import os
+ from cartesia.tts import CartesiaTTS
52
+ from IPython.display import Audio
53
+
54
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
+
56
+ voices = client.get_voices()
57
+ embedding = client.get_voice_embedding(voice_id=voices["Milo"]["id"])
58
+ transcript = "Hello! Welcome to Cartesia"
59
+
60
+ # No streaming
61
+ output = client.generate(transcript=transcript, voice=embedding)
62
+ Audio(output["audio"], rate=output["sampling_rate"])
63
+
64
+ # Streaming
65
+ for output in client.generate(transcript=transcript, voice=embedding, stream=True):
66
+ arr = output["audio"] # a numpy array
67
+ rate = output["sampling_rate"]
68
+ ```
69
+
70
+ We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
@@ -0,0 +1,7 @@
1
+ cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
+ cartesia/tts.py,sha256=kQjkQhWfVrLFH6yaOb6G65HXtFDzPwLa6Q9AwVgIyCI,11901
3
+ cartesia/version.py,sha256=QvlVh4JTl3JL7jQAja76yKtT-IvF4631ASjWY1wS6AQ,22
4
+ cartesia-0.0.2.dist-info/METADATA,sha256=7BcDRyB4vxCWxcJhTbe_cWEQXCNOiEJdqoWh8WouNGs,2465
5
+ cartesia-0.0.2.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
+ cartesia-0.0.2.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
+ cartesia-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,6 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.41.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py2-none-any
5
+ Tag: py3-none-any
6
+
@@ -0,0 +1 @@
1
+ cartesia