cartesia 0.0.2__py2.py3-none-any.whl → 0.0.3__py2.py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
cartesia/tts.py CHANGED
@@ -4,7 +4,6 @@ import os
4
4
  import uuid
5
5
  from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
6
 
7
- import numpy as np
8
7
  import requests
9
8
  from websockets.sync.client import connect
10
9
 
@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"
14
13
 
15
14
 
16
15
  class AudioOutput(TypedDict):
17
- audio: np.ndarray
16
+ audio: bytes
18
17
  sampling_rate: int
19
18
 
20
19
 
@@ -176,13 +175,26 @@ class CartesiaTTS:
176
175
  def _is_websocket_closed(self):
177
176
  return self.websocket.socket.fileno() == -1
178
177
 
178
+ def _check_inputs(
179
+ self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
180
+ ):
181
+ if chunk_time is not None:
182
+ if chunk_time < 0.1 or chunk_time > 0.5:
183
+ raise ValueError("`chunk_time` must be between 0.1 and 0.5")
184
+
185
+ if chunk_time is not None and duration is not None:
186
+ if duration < chunk_time:
187
+ raise ValueError("`duration` must be greater than chunk_time")
188
+
189
+ if transcript.strip() == "":
190
+ raise ValueError("`transcript` must be non empty")
191
+
179
192
  def generate(
180
193
  self,
181
194
  *,
182
195
  transcript: str,
183
196
  duration: int = None,
184
197
  chunk_time: float = None,
185
- lookahead: int = None,
186
198
  voice: Embedding = None,
187
199
  stream: bool = False,
188
200
  websocket: bool = True,
@@ -194,8 +206,6 @@ class CartesiaTTS:
194
206
  duration: The maximum duration of the audio in seconds.
195
207
  chunk_time: How long each audio segment should be in seconds.
196
208
  This should not need to be adjusted.
197
- lookahead: The number of seconds to look ahead for each chunk.
198
- This should not need to be adjusted.
199
209
  voice: The voice to use for generating audio.
200
210
  This can either be a voice id (string) or an embedding vector (List[float]).
201
211
  stream: Whether to stream the audio or not.
@@ -206,18 +216,16 @@ class CartesiaTTS:
206
216
  Returns:
207
217
  A generator if `stream` is True, otherwise a dictionary.
208
218
  Dictionary from both generator and non-generator return types have the following keys:
209
- * "audio": The audio as a 1D numpy array.
219
+ * "audio": The audio as a bytes buffer.
210
220
  * "sampling_rate": The sampling rate of the audio.
211
221
  """
212
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
222
+ self._check_inputs(transcript, duration, chunk_time)
213
223
 
214
- if isinstance(voice, str):
215
- voice = self._voices[voice]
224
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
216
225
 
217
226
  optional_body = dict(
218
227
  duration=duration,
219
228
  chunk_time=chunk_time,
220
- lookahead=lookahead,
221
229
  voice=voice,
222
230
  )
223
231
  body.update({k: v for k, v in optional_body.items() if v is not None})
@@ -237,7 +245,7 @@ class CartesiaTTS:
237
245
  sampling_rate = chunk["sampling_rate"]
238
246
  chunks.append(chunk["audio"])
239
247
 
240
- return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}
248
+ return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
241
249
 
242
250
  def _generate_http(self, body: Dict[str, Any]):
243
251
  response = requests.post(
@@ -258,8 +266,7 @@ class CartesiaTTS:
258
266
  if start_index != -1 and end_index != -1:
259
267
  try:
260
268
  chunk_json = json.loads(buffer[start_index : end_index + 1])
261
- data = base64.b64decode(chunk_json["data"])
262
- audio = np.frombuffer(data, dtype=np.float32)
269
+ audio = base64.b64decode(chunk_json["data"])
263
270
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
264
271
  buffer = buffer[end_index + 1 :]
265
272
  except json.JSONDecodeError:
@@ -268,8 +275,7 @@ class CartesiaTTS:
268
275
  if buffer:
269
276
  try:
270
277
  chunk_json = json.loads(buffer)
271
- data = base64.b64decode(chunk_json["data"])
272
- audio = np.frombuffer(data, dtype=np.float32)
278
+ audio = base64.b64decode(chunk_json["data"])
273
279
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
274
280
  except json.JSONDecodeError:
275
281
  pass
@@ -282,8 +288,7 @@ class CartesiaTTS:
282
288
  try:
283
289
  response = json.loads(self.websocket.recv())
284
290
  while not response["done"]:
285
- data = base64.b64decode(response["data"])
286
- audio = np.frombuffer(data, dtype=np.float32)
291
+ audio = base64.b64decode(response["data"])
287
292
  # print("timing", time.perf_counter() - start)
288
293
  yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
289
294
 
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.2"
1
+ __version__ = "0.0.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
13
  Requires-Dist: websockets
14
14
  Requires-Dist: requests
15
- Requires-Dist: numpy
16
15
  Provides-Extra: all
17
16
  Requires-Dist: pre-commit ; extra == 'all'
18
17
  Requires-Dist: docformatter ; extra == 'all'
@@ -49,22 +48,66 @@ pip install -e '.[dev]'
49
48
  ## Usage
50
49
  ```python
51
50
  from cartesia.tts import CartesiaTTS
52
- from IPython.display import Audio
51
+ import pyaudio
52
+ import os
53
53
 
54
54
  client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
-
56
55
  voices = client.get_voices()
57
- embedding = voices["Milo"]["embedding"]
56
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
58
57
  transcript = "Hello! Welcome to Cartesia"
59
58
 
60
- # No streaming
61
- output = client.generate(transcript=transcript, voice=embedding)
62
- Audio(output["audio"], rate=output["sampling_rate"])
59
+ p = pyaudio.PyAudio()
60
+
61
+ stream = None
63
62
 
64
- # Streaming
65
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
66
- arr = output["audio"] # a numpy array
63
+ # Generate and stream audio
64
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
+ buffer = output["audio"]
67
66
  rate = output["sampling_rate"]
67
+
68
+ if not stream:
69
+ stream = p.open(format=pyaudio.paFloat32,
70
+ channels=1,
71
+ rate=rate,
72
+ output=True)
73
+
74
+ # Write the audio data to the stream
75
+ stream.write(buffer)
76
+
77
+ stream.stop_stream()
78
+ stream.close()
79
+ p.terminate()
80
+ ```
81
+
82
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
83
+
84
+ ```python
85
+ from cartesia.tts import CartesiaTTS
86
+ from IPython.display import Audio
87
+ import io
88
+ import os
89
+
90
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
+ voices = client.get_voices()
92
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
+ transcript = "Hello! Welcome to Cartesia"
94
+
95
+ # Create a BytesIO object to store the audio data
96
+ audio_data = io.BytesIO()
97
+
98
+ # Generate and stream audio
99
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
+ buffer = output["audio"]
101
+ audio_data.write(buffer)
102
+
103
+ # Set the cursor position to the beginning of the BytesIO object
104
+ audio_data.seek(0)
105
+
106
+ # Create an Audio object from the BytesIO data
107
+ audio = Audio(audio_data, rate=output["sampling_rate"])
108
+
109
+ # Display the Audio object
110
+ display(audio)
68
111
  ```
69
112
 
70
113
  We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
@@ -0,0 +1,7 @@
1
+ cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
+ cartesia/tts.py,sha256=ABXW9rc8Pn0GTRvb_7DHZKMtbvhGUiqOgHmvztwlOnI,12033
3
+ cartesia/version.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
4
+ cartesia-0.0.3.dist-info/METADATA,sha256=VsCGL1sITbKqERihK2rzVm9WIY5EJ5nCS_CXQ0s14ns,3604
5
+ cartesia-0.0.3.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
+ cartesia-0.0.3.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
+ cartesia-0.0.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- cartesia/__init__.py,sha256=m8BX-qLjsMoI_JZtgf3jNi8R3cBZqYy-z4oEhYeJLdI,64
2
- cartesia/tts.py,sha256=kQjkQhWfVrLFH6yaOb6G65HXtFDzPwLa6Q9AwVgIyCI,11901
3
- cartesia/version.py,sha256=QvlVh4JTl3JL7jQAja76yKtT-IvF4631ASjWY1wS6AQ,22
4
- cartesia-0.0.2.dist-info/METADATA,sha256=7BcDRyB4vxCWxcJhTbe_cWEQXCNOiEJdqoWh8WouNGs,2465
5
- cartesia-0.0.2.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
6
- cartesia-0.0.2.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
7
- cartesia-0.0.2.dist-info/RECORD,,