cartesia 0.0.2__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
13
  Requires-Dist: websockets
14
14
  Requires-Dist: requests
15
- Requires-Dist: numpy
16
15
  Provides-Extra: dev
17
16
  Requires-Dist: pre-commit; extra == "dev"
18
17
  Requires-Dist: docformatter; extra == "dev"
@@ -49,22 +48,66 @@ pip install -e '.[dev]'
49
48
  ## Usage
50
49
  ```python
51
50
  from cartesia.tts import CartesiaTTS
52
- from IPython.display import Audio
51
+ import pyaudio
52
+ import os
53
53
 
54
54
  client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
-
56
55
  voices = client.get_voices()
57
- embedding = voices["Milo"]["embedding"]
56
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
58
57
  transcript = "Hello! Welcome to Cartesia"
59
58
 
60
- # No streaming
61
- output = client.generate(transcript=transcript, voice=embedding)
62
- Audio(output["audio"], rate=output["sampling_rate"])
59
+ p = pyaudio.PyAudio()
60
+
61
+ stream = None
63
62
 
64
- # Streaming
65
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
66
- arr = output["audio"] # a numpy array
63
+ # Generate and stream audio
64
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
+ buffer = output["audio"]
67
66
  rate = output["sampling_rate"]
67
+
68
+ if not stream:
69
+ stream = p.open(format=pyaudio.paFloat32,
70
+ channels=1,
71
+ rate=rate,
72
+ output=True)
73
+
74
+ # Write the audio data to the stream
75
+ stream.write(buffer)
76
+
77
+ stream.stop_stream()
78
+ stream.close()
79
+ p.terminate()
80
+ ```
81
+
82
+ If you are using Jupyter Notebook or JupyterLab, you can use `IPython.display.Audio` to play the generated audio directly in the notebook. Here's an example:
83
+
84
+ ```python
85
+ from cartesia.tts import CartesiaTTS
86
+ from IPython.display import Audio
87
+ import io
88
+ import os
89
+
90
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
+ voices = client.get_voices()
92
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
+ transcript = "Hello! Welcome to Cartesia"
94
+
95
+ # Create a BytesIO object to store the audio data
96
+ audio_data = io.BytesIO()
97
+
98
+ # Generate and stream audio
99
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
+ buffer = output["audio"]
101
+ audio_data.write(buffer)
102
+
103
+ # Set the cursor position to the beginning of the BytesIO object
104
+ audio_data.seek(0)
105
+
106
+ # Create an Audio object from the BytesIO data
107
+ audio = Audio(audio_data, rate=output["sampling_rate"])
108
+
109
+ # Display the Audio object
110
+ display(audio)
68
111
  ```
69
112
 
70
113
  We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your `.env` file so that your API key is not stored in the source code.
@@ -0,0 +1,79 @@
1
+ # Cartesia Python API Library
2
+ The official Cartesia Python library provides convenient access to the Cartesia REST and WebSocket APIs from any Python 3.8+ application.
3
+
4
+ **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
5
+
6
+ ## Installation
7
+ ```bash
8
+ pip install cartesia
9
+
10
+ # pip install in editable mode w/ dev dependencies
11
+ pip install -e '.[dev]'
12
+ ```
13
+
14
+ ## Usage
15
+ ```python
16
+ from cartesia.tts import CartesiaTTS
17
+ import pyaudio
18
+ import os
19
+
20
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
21
+ voices = client.get_voices()
22
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
23
+ transcript = "Hello! Welcome to Cartesia"
24
+
25
+ p = pyaudio.PyAudio()
26
+
27
+ stream = None
28
+
29
+ # Generate and stream audio
30
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
31
+ buffer = output["audio"]
32
+ rate = output["sampling_rate"]
33
+
34
+ if not stream:
35
+ stream = p.open(format=pyaudio.paFloat32,
36
+ channels=1,
37
+ rate=rate,
38
+ output=True)
39
+
40
+ # Write the audio data to the stream
41
+ stream.write(buffer)
42
+
43
+ stream.stop_stream()
44
+ stream.close()
45
+ p.terminate()
46
+ ```
47
+
48
+ If you are using Jupyter Notebook or JupyterLab, you can use `IPython.display.Audio` to play the generated audio directly in the notebook. Here's an example:
49
+
50
+ ```python
51
+ from cartesia.tts import CartesiaTTS
52
+ from IPython.display import Audio
53
+ import io
54
+ import os
55
+
56
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
57
+ voices = client.get_voices()
58
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
59
+ transcript = "Hello! Welcome to Cartesia"
60
+
61
+ # Create a BytesIO object to store the audio data
62
+ audio_data = io.BytesIO()
63
+
64
+ # Generate and stream audio
65
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
66
+ buffer = output["audio"]
67
+ audio_data.write(buffer)
68
+
69
+ # Set the cursor position to the beginning of the BytesIO object
70
+ audio_data.seek(0)
71
+
72
+ # Create an Audio object from the BytesIO data
73
+ audio = Audio(audio_data, rate=output["sampling_rate"])
74
+
75
+ # Display the Audio object
76
+ display(audio)
77
+ ```
78
+
79
+ We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your `.env` file so that your API key is not stored in the source code.
@@ -4,7 +4,6 @@ import os
4
4
  import uuid
5
5
  from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
6
 
7
- import numpy as np
8
7
  import requests
9
8
  from websockets.sync.client import connect
10
9
 
@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"
14
13
 
15
14
 
16
15
  class AudioOutput(TypedDict):
17
- audio: np.ndarray
16
+ audio: bytes
18
17
  sampling_rate: int
19
18
 
20
19
 
@@ -176,13 +175,26 @@ class CartesiaTTS:
176
175
  def _is_websocket_closed(self):
177
176
  return self.websocket.socket.fileno() == -1
178
177
 
178
+ def _check_inputs(
179
+ self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
180
+ ):
181
+ if chunk_time is not None:
182
+ if chunk_time < 0.1 or chunk_time > 0.5:
183
+ raise ValueError("`chunk_time` must be between 0.1 and 0.5")
184
+
185
+ if chunk_time is not None and duration is not None:
186
+ if duration < chunk_time:
187
+ raise ValueError("`duration` must be greater than chunk_time")
188
+
189
+ if transcript.strip() == "":
190
+ raise ValueError("`transcript` must be non empty")
191
+
179
192
  def generate(
180
193
  self,
181
194
  *,
182
195
  transcript: str,
183
196
  duration: int = None,
184
197
  chunk_time: float = None,
185
- lookahead: int = None,
186
198
  voice: Embedding = None,
187
199
  stream: bool = False,
188
200
  websocket: bool = True,
@@ -194,8 +206,6 @@ class CartesiaTTS:
194
206
  duration: The maximum duration of the audio in seconds.
195
207
  chunk_time: How long each audio segment should be in seconds.
196
208
  This should not need to be adjusted.
197
- lookahead: The number of seconds to look ahead for each chunk.
198
- This should not need to be adjusted.
199
209
  voice: The voice to use for generating audio.
200
210
  This can either be a voice id (string) or an embedding vector (List[float]).
201
211
  stream: Whether to stream the audio or not.
@@ -206,18 +216,16 @@ class CartesiaTTS:
206
216
  Returns:
207
217
  A generator if `stream` is True, otherwise a dictionary.
208
218
  Dictionary from both generator and non-generator return types have the following keys:
209
- * "audio": The audio as a 1D numpy array.
219
+ * "audio": The audio as a bytes buffer.
210
220
  * "sampling_rate": The sampling rate of the audio.
211
221
  """
212
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
222
+ self._check_inputs(transcript, duration, chunk_time)
213
223
 
214
- if isinstance(voice, str):
215
- voice = self._voices[voice]
224
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
216
225
 
217
226
  optional_body = dict(
218
227
  duration=duration,
219
228
  chunk_time=chunk_time,
220
- lookahead=lookahead,
221
229
  voice=voice,
222
230
  )
223
231
  body.update({k: v for k, v in optional_body.items() if v is not None})
@@ -237,7 +245,7 @@ class CartesiaTTS:
237
245
  sampling_rate = chunk["sampling_rate"]
238
246
  chunks.append(chunk["audio"])
239
247
 
240
- return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}
248
+ return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
241
249
 
242
250
  def _generate_http(self, body: Dict[str, Any]):
243
251
  response = requests.post(
@@ -258,8 +266,7 @@ class CartesiaTTS:
258
266
  if start_index != -1 and end_index != -1:
259
267
  try:
260
268
  chunk_json = json.loads(buffer[start_index : end_index + 1])
261
- data = base64.b64decode(chunk_json["data"])
262
- audio = np.frombuffer(data, dtype=np.float32)
269
+ audio = base64.b64decode(chunk_json["data"])
263
270
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
264
271
  buffer = buffer[end_index + 1 :]
265
272
  except json.JSONDecodeError:
@@ -268,8 +275,7 @@ class CartesiaTTS:
268
275
  if buffer:
269
276
  try:
270
277
  chunk_json = json.loads(buffer)
271
- data = base64.b64decode(chunk_json["data"])
272
- audio = np.frombuffer(data, dtype=np.float32)
278
+ audio = base64.b64decode(chunk_json["data"])
273
279
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
274
280
  except json.JSONDecodeError:
275
281
  pass
@@ -282,8 +288,7 @@ class CartesiaTTS:
282
288
  try:
283
289
  response = json.loads(self.websocket.recv())
284
290
  while not response["done"]:
285
- data = base64.b64decode(response["data"])
286
- audio = np.frombuffer(data, dtype=np.float32)
291
+ audio = base64.b64decode(response["data"])
287
292
  # print("timing", time.perf_counter() - start)
288
293
  yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
289
294
 
@@ -0,0 +1 @@
1
+ __version__ = "0.0.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
13
  Requires-Dist: websockets
14
14
  Requires-Dist: requests
15
- Requires-Dist: numpy
16
15
  Provides-Extra: dev
17
16
  Requires-Dist: pre-commit; extra == "dev"
18
17
  Requires-Dist: docformatter; extra == "dev"
@@ -49,22 +48,66 @@ pip install -e '.[dev]'
49
48
  ## Usage
50
49
  ```python
51
50
  from cartesia.tts import CartesiaTTS
52
- from IPython.display import Audio
51
+ import pyaudio
52
+ import os
53
53
 
54
54
  client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
-
56
55
  voices = client.get_voices()
57
- embedding = voices["Milo"]["embedding"]
56
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
58
57
  transcript = "Hello! Welcome to Cartesia"
59
58
 
60
- # No streaming
61
- output = client.generate(transcript=transcript, voice=embedding)
62
- Audio(output["audio"], rate=output["sampling_rate"])
59
+ p = pyaudio.PyAudio()
60
+
61
+ stream = None
63
62
 
64
- # Streaming
65
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
66
- arr = output["audio"] # a numpy array
63
+ # Generate and stream audio
64
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
+ buffer = output["audio"]
67
66
  rate = output["sampling_rate"]
67
+
68
+ if not stream:
69
+ stream = p.open(format=pyaudio.paFloat32,
70
+ channels=1,
71
+ rate=rate,
72
+ output=True)
73
+
74
+ # Write the audio data to the stream
75
+ stream.write(buffer)
76
+
77
+ stream.stop_stream()
78
+ stream.close()
79
+ p.terminate()
80
+ ```
81
+
82
+ If you are using Jupyter Notebook or JupyterLab, you can use `IPython.display.Audio` to play the generated audio directly in the notebook. Here's an example:
83
+
84
+ ```python
85
+ from cartesia.tts import CartesiaTTS
86
+ from IPython.display import Audio
87
+ import io
88
+ import os
89
+
90
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
+ voices = client.get_voices()
92
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
+ transcript = "Hello! Welcome to Cartesia"
94
+
95
+ # Create a BytesIO object to store the audio data
96
+ audio_data = io.BytesIO()
97
+
98
+ # Generate and stream audio
99
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
+ buffer = output["audio"]
101
+ audio_data.write(buffer)
102
+
103
+ # Set the cursor position to the beginning of the BytesIO object
104
+ audio_data.seek(0)
105
+
106
+ # Create an Audio object from the BytesIO data
107
+ audio = Audio(audio_data, rate=output["sampling_rate"])
108
+
109
+ # Display the Audio object
110
+ display(audio)
68
111
  ```
69
112
 
70
113
  We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your `.env` file so that your API key is not stored in the source code.
@@ -1,6 +1,5 @@
1
1
  websockets
2
2
  requests
3
- numpy
4
3
 
5
4
  [all]
6
5
  pre-commit
@@ -8,7 +8,6 @@ but rather for general correctness.
8
8
  import os
9
9
  from typing import Dict, Generator
10
10
 
11
- import numpy as np
12
11
  import pytest
13
12
 
14
13
  from cartesia.tts import CartesiaTTS, VoiceMetadata
@@ -72,8 +71,7 @@ def test_generate(resources: _Resources, websocket: bool):
72
71
 
73
72
  output = client.generate(transcript=transcript, voice=embedding, websocket=websocket)
74
73
  assert output.keys() == {"audio", "sampling_rate"}
75
- assert isinstance(output["audio"], np.ndarray)
76
- assert output["audio"].dtype == np.float32
74
+ assert isinstance(output["audio"], bytes)
77
75
  assert isinstance(output["sampling_rate"], int)
78
76
 
79
77
 
@@ -91,6 +89,45 @@ def test_generate_stream(resources: _Resources, websocket: bool):
91
89
 
92
90
  for output in generator:
93
91
  assert output.keys() == {"audio", "sampling_rate"}
94
- assert isinstance(output["audio"], np.ndarray)
95
- assert output["audio"].dtype == np.float32
92
+ assert isinstance(output["audio"], bytes)
96
93
  assert isinstance(output["sampling_rate"], int)
94
+
95
+
96
+ @pytest.mark.parametrize("chunk_time", [0.05, 0.6])
97
+ def test_check_inputs_invalid_chunk_time(client: CartesiaTTS, chunk_time):
98
+ with pytest.raises(ValueError, match="`chunk_time` must be between 0.1 and 0.5"):
99
+ client._check_inputs("Test", None, chunk_time)
100
+
101
+
102
+ @pytest.mark.parametrize("chunk_time", [0.1, 0.3, 0.5])
103
+ def test_check_inputs_valid_chunk_time(client, chunk_time):
104
+ try:
105
+ client._check_inputs("Test", None, chunk_time)
106
+ except ValueError:
107
+ pytest.fail("Unexpected ValueError raised")
108
+
109
+
110
+ def test_check_inputs_duration_less_than_chunk_time(client: CartesiaTTS):
111
+ with pytest.raises(ValueError, match="`duration` must be greater than chunk_time"):
112
+ client._check_inputs("Test", 0.2, 0.3)
113
+
114
+
115
+ @pytest.mark.parametrize("duration,chunk_time", [(0.5, 0.2), (1.0, 0.5), (2.0, 0.1)])
116
+ def test_check_inputs_valid_duration_and_chunk_time(client: CartesiaTTS, duration, chunk_time):
117
+ try:
118
+ client._check_inputs("Test", duration, chunk_time)
119
+ except ValueError:
120
+ pytest.fail("Unexpected ValueError raised")
121
+
122
+
123
+ def test_check_inputs_empty_transcript(client: CartesiaTTS):
124
+ with pytest.raises(ValueError, match="`transcript` must be non empty"):
125
+ client._check_inputs("", None, None)
126
+
127
+
128
+ @pytest.mark.parametrize("transcript", ["Hello", "Test transcript", "Lorem ipsum dolor sit amet"])
129
+ def test_check_inputs_valid_transcript(client: CartesiaTTS, transcript):
130
+ try:
131
+ client._check_inputs(transcript, None, None)
132
+ except ValueError:
133
+ pytest.fail("Unexpected ValueError raised")
cartesia-0.0.2/README.md DELETED
@@ -1,35 +0,0 @@
1
- # Cartesia Python API Library
2
- The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
3
-
4
- **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
5
-
6
- ## Installation
7
- ```bash
8
- pip install cartesia
9
-
10
- # pip install in editable mode w/ dev dependencies
11
- pip install -e '.[dev]'
12
- ```
13
-
14
- ## Usage
15
- ```python
16
- from cartesia.tts import CartesiaTTS
17
- from IPython.display import Audio
18
-
19
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
20
-
21
- voices = client.get_voices()
22
- embedding = voices["Milo"]["embedding"]
23
- transcript = "Hello! Welcome to Cartesia"
24
-
25
- # No streaming
26
- output = client.generate(transcript=transcript, voice=embedding)
27
- Audio(output["audio"], rate=output["sampling_rate"])
28
-
29
- # Streaming
30
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
31
- arr = output["audio"] # a numpy array
32
- rate = output["sampling_rate"]
33
- ```
34
-
35
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
@@ -1 +0,0 @@
1
- __version__ = "0.0.2"
File without changes
File without changes
File without changes
File without changes