cartesia 0.0.2__py2.py3-none-any.whl → 0.0.4__py2.py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
cartesia/tts.py CHANGED
@@ -4,7 +4,6 @@ import os
  import uuid
  from typing import Any, Dict, Generator, List, Optional, TypedDict, Union

- import numpy as np
  import requests
  from websockets.sync.client import connect

@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"


  class AudioOutput(TypedDict):
- audio: np.ndarray
+ audio: bytes
  sampling_rate: int


@@ -32,7 +31,11 @@ class CartesiaTTS:
  """The client for Cartesia's text-to-speech library.

  This client contains methods to interact with the Cartesia text-to-speech API.
- The API offers
+ The client can be used to retrieve available voices, compute new voice embeddings,
+ and generate speech from text.
+
+ The client also supports generating audio using a websocket for lower latency.
+ To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.

  Examples:

@@ -56,18 +59,22 @@ class CartesiaTTS:
  ... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
  """

- def __init__(self, *, api_key: str = None):
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
  """
  Args:
  api_key: The API key to use for authorization.
  If not specified, the API key will be read from the environment variable
  `CARTESIA_API_KEY`.
+ experimental_ws_handle_interrupts: Whether to handle interrupts when generating
+ audio using the websocket. This is an experimental feature and may have bugs
+ or be deprecated in the future.
  """
  self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
  self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
  self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
  self.websocket = None
+ self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
  self.refresh_websocket()

  def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
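
Editor's note: a minimal sketch (not taken from the package docs) of how the new constructor flag described in the hunks above might be used. It assumes `CARTESIA_API_KEY` is set in the environment, since `__init__` falls back to that variable when `api_key` is not passed and opens the websocket itself via `refresh_websocket()`.

```python
# Sketch: opting in to the experimental websocket interrupt handling.
# Assumes the CARTESIA_API_KEY environment variable is set.
from cartesia.tts import CartesiaTTS

client = CartesiaTTS(experimental_ws_handle_interrupts=True)
# With the flag set, refresh_websocket() connects to the
# "experimental/audio/websocket" route instead of "audio/websocket".
```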
@@ -168,21 +175,37 @@ class CartesiaTTS:
  """
  if self.websocket and not self._is_websocket_closed():
  self.websocket.close()
+ route = "audio/websocket"
+ if self.experimental_ws_handle_interrupts:
+ route = f"experimental/{route}"
  self.websocket = connect(
- f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
+ f"{self._ws_url()}/{route}?api_key={self.api_key}",
  close_timeout=None,
  )

  def _is_websocket_closed(self):
  return self.websocket.socket.fileno() == -1

+ def _check_inputs(
+ self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
+ ):
+ if chunk_time is not None:
+ if chunk_time < 0.1 or chunk_time > 0.5:
+ raise ValueError("`chunk_time` must be between 0.1 and 0.5")
+
+ if chunk_time is not None and duration is not None:
+ if duration < chunk_time:
+ raise ValueError("`duration` must be greater than chunk_time")
+
+ if transcript.strip() == "":
+ raise ValueError("`transcript` must be non empty")
+
  def generate(
  self,
  *,
  transcript: str,
  duration: int = None,
  chunk_time: float = None,
- lookahead: int = None,
  voice: Embedding = None,
  stream: bool = False,
  websocket: bool = True,
@@ -194,8 +217,6 @@ class CartesiaTTS:
  duration: The maximum duration of the audio in seconds.
  chunk_time: How long each audio segment should be in seconds.
  This should not need to be adjusted.
- lookahead: The number of seconds to look ahead for each chunk.
- This should not need to be adjusted.
  voice: The voice to use for generating audio.
  This can either be a voice id (string) or an embedding vector (List[float]).
  stream: Whether to stream the audio or not.
@@ -206,18 +227,16 @@ class CartesiaTTS:
  Returns:
  A generator if `stream` is True, otherwise a dictionary.
  Dictionary from both generator and non-generator return types have the following keys:
- * "audio": The audio as a 1D numpy array.
+ * "audio": The audio as a bytes buffer.
  * "sampling_rate": The sampling rate of the audio.
  """
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
+ self._check_inputs(transcript, duration, chunk_time)

- if isinstance(voice, str):
- voice = self._voices[voice]
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)

  optional_body = dict(
  duration=duration,
  chunk_time=chunk_time,
- lookahead=lookahead,
  voice=voice,
  )
  body.update({k: v for k, v in optional_body.items() if v is not None})
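
Editor's note: taken together, the hunks above change both what `generate()` returns and which inputs it accepts. A hedged sketch, assuming `client` and `voice` are set up as in the README example shown later in this diff:

```python
# Sketch; assumes `client` and `voice` exist as in the README example.
output = client.generate(transcript="Hello! Welcome to Cartesia", voice=voice)
buffer = output["audio"]          # raw bytes in 0.0.4 (was a numpy array in 0.0.2)
rate = output["sampling_rate"]

# numpy is no longer a dependency, but the buffer can still be viewed as
# float32 samples if numpy is installed (0.0.2 decoded it the same way):
# import numpy as np
# samples = np.frombuffer(buffer, dtype=np.float32)

# Inputs rejected by the new _check_inputs helper (each raises ValueError):
# client.generate(transcript="Hi", voice=voice, chunk_time=0.05)               # chunk_time must be between 0.1 and 0.5
# client.generate(transcript="Hi", voice=voice, chunk_time=0.5, duration=0.2)  # duration must be greater than chunk_time
# client.generate(transcript="   ", voice=voice)                               # transcript must be non empty
```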
@@ -237,7 +256,7 @@ class CartesiaTTS:
  sampling_rate = chunk["sampling_rate"]
  chunks.append(chunk["audio"])

- return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}
+ return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}

  def _generate_http(self, body: Dict[str, Any]):
  response = requests.post(
@@ -258,8 +277,7 @@ class CartesiaTTS:
  if start_index != -1 and end_index != -1:
  try:
  chunk_json = json.loads(buffer[start_index : end_index + 1])
- data = base64.b64decode(chunk_json["data"])
- audio = np.frombuffer(data, dtype=np.float32)
+ audio = base64.b64decode(chunk_json["data"])
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
  buffer = buffer[end_index + 1 :]
  except json.JSONDecodeError:
@@ -268,28 +286,55 @@ class CartesiaTTS:
  if buffer:
  try:
  chunk_json = json.loads(buffer)
- data = base64.b64decode(chunk_json["data"])
- audio = np.frombuffer(data, dtype=np.float32)
+ audio = base64.b64decode(chunk_json["data"])
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
  except json.JSONDecodeError:
  pass

- def _generate_ws(self, body: Dict[str, Any]):
+ def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
+ """Generate audio using the websocket connection.
+
+ Args:
+ body: The request body.
+ context_id: The context id for the request.
+ The context id must be globally unique for the duration this client exists.
+ If this is provided, the context id that is in the response will
+ also be returned as part of the dict. This is helpful for testing.
+ """
  if not self.websocket or self._is_websocket_closed():
  self.refresh_websocket()

- self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
+ include_context_id = bool(context_id)
+ if context_id is None:
+ context_id = uuid.uuid4().hex
+ self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
  try:
- response = json.loads(self.websocket.recv())
- while not response["done"]:
- data = base64.b64decode(response["data"])
- audio = np.frombuffer(data, dtype=np.float32)
- # print("timing", time.perf_counter() - start)
- yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
-
+ while True:
  response = json.loads(self.websocket.recv())
- except Exception:
- raise RuntimeError(f"Failed to generate audio. {response}")
+ if response["done"]:
+ break
+ audio = base64.b64decode(response["data"])
+
+ optional_kwargs = {}
+ if include_context_id:
+ optional_kwargs["context_id"] = response["context_id"]
+
+ yield {
+ "audio": audio,
+ "sampling_rate": response["sampling_rate"],
+ **optional_kwargs,
+ }
+
+ if self.experimental_ws_handle_interrupts:
+ self.websocket.send(json.dumps({"context_id": context_id}))
+ except GeneratorExit:
+ # The exit is only called when the generator is garbage collected.
+ # It may not be called directly after a break statement.
+ # However, the generator will be automatically cancelled on the next request.
+ if self.experimental_ws_handle_interrupts:
+ self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
+ except Exception as e:
+ raise RuntimeError(f"Failed to generate audio. {response}") from e

  def _http_url(self):
  prefix = "http" if "localhost" in self.base_url else "https"
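
Editor's note: to illustrate the new websocket path end to end, a hedged sketch of interrupting a streamed generation. It assumes `client = CartesiaTTS(experimental_ws_handle_interrupts=True)`, a `voice` obtained as in the README example, and that `generate(..., stream=True, websocket=True)` is routed through `_generate_ws` above; the stop condition is purely illustrative.

```python
# Sketch: streaming over the websocket and stopping early.
chunks = []
for output in client.generate(
    transcript="A long passage that the caller may decide to interrupt...",
    voice=voice,
    stream=True,
    websocket=True,
):
    chunks.append(output["audio"])
    if len(chunks) >= 3:  # illustrative early stop
        break
# Per the comments in the hunk above, the cancel message for this context_id
# is sent from the GeneratorExit handler when the generator is closed or
# garbage collected (not necessarily right after `break`); otherwise the
# context is cancelled on the next request.

audio_so_far = b"".join(chunks)
```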
cartesia/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.0.2"
+ __version__ = "0.0.4"
cartesia-0.0.2.dist-info/METADATA → cartesia-0.0.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 0.0.2
+ Version: 0.0.4
  Summary: The official Python library for the Cartesia API.
  Home-page:
  Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
  Description-Content-Type: text/markdown
  Requires-Dist: websockets
  Requires-Dist: requests
- Requires-Dist: numpy
  Provides-Extra: all
  Requires-Dist: pre-commit ; extra == 'all'
  Requires-Dist: docformatter ; extra == 'all'
@@ -49,22 +48,68 @@ pip install -e '.[dev]'
  ## Usage
  ```python
  from cartesia.tts import CartesiaTTS
- from IPython.display import Audio
+ import pyaudio
+ import os

  client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
-
  voices = client.get_voices()
- embedding = voices["Milo"]["embedding"]
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
  transcript = "Hello! Welcome to Cartesia"

- # No streaming
- output = client.generate(transcript=transcript, voice=embedding)
- Audio(output["audio"], rate=output["sampling_rate"])
+ p = pyaudio.PyAudio()
+
+ stream = None

- # Streaming
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
- arr = output["audio"] # a numpy array
+ # Generate and stream audio
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
+ buffer = output["audio"]
  rate = output["sampling_rate"]
+
+ if not stream:
+ stream = p.open(format=pyaudio.paFloat32,
+ channels=1,
+ rate=rate,
+ output=True)
+
+ # Write the audio data to the stream
+ stream.write(buffer)
+
+ stream.stop_stream()
+ stream.close()
+ p.terminate()
+ ```
+
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
+
+ ```python
+ from cartesia.tts import CartesiaTTS
+ from IPython.display import Audio
+ import io
+ import os
+
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
+ voices = client.get_voices()
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
+ transcript = "Hello! Welcome to Cartesia"
+
+ # Create a BytesIO object to store the audio data
+ audio_data = io.BytesIO()
+
+ # Generate and stream audio
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
+ buffer = output["audio"]
+ audio_data.write(buffer)
+
+ # Set the cursor position to the beginning of the BytesIO object
+ audio_data.seek(0)
+
+ # Create an Audio object from the BytesIO data
+ audio = Audio(audio_data, rate=output["sampling_rate"])
+
+ # Display the Audio object
+ display(audio)
  ```

- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
+ To avoid storing your API key in the source code, we recommend doing one of the following:
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably to a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`)
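
Editor's note: for the first option above, a minimal sketch of loading the key with python-dotenv; the `.env` file and its contents are assumptions.

```python
# Sketch: a .env file in the working directory contains CARTESIA_API_KEY="my-api-key"
from dotenv import load_dotenv
from cartesia.tts import CartesiaTTS

load_dotenv()            # reads .env into the process environment
client = CartesiaTTS()   # picks up CARTESIA_API_KEY from the environment
```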
cartesia-0.0.4.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
+ cartesia/tts.py,sha256=9m9_kqscMY0yzUU0Ty5k2HoeMqfrIbHouaS-ymcr64s,14127
+ cartesia/version.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
+ cartesia-0.0.4.dist-info/METADATA,sha256=tLUrKLREJiXrW-pfd3k61i9CnElKHk5RAyidCMxpR-s,3752
+ cartesia-0.0.4.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
+ cartesia-0.0.4.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+ cartesia-0.0.4.dist-info/RECORD,,
cartesia-0.0.2.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
- cartesia/tts.py,sha256=kQjkQhWfVrLFH6yaOb6G65HXtFDzPwLa6Q9AwVgIyCI,11901
- cartesia/version.py,sha256=QvlVh4JTl3JL7jQAja76yKtT-IvF4631ASjWY1wS6AQ,22
- cartesia-0.0.2.dist-info/METADATA,sha256=7BcDRyB4vxCWxcJhTbe_cWEQXCNOiEJdqoWh8WouNGs,2465
- cartesia-0.0.2.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
- cartesia-0.0.2.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
- cartesia-0.0.2.dist-info/RECORD,,