cartesia 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
13
  Requires-Dist: websockets
14
14
  Requires-Dist: requests
15
- Requires-Dist: numpy
16
15
  Provides-Extra: dev
17
16
  Requires-Dist: pre-commit; extra == "dev"
18
17
  Requires-Dist: docformatter; extra == "dev"
@@ -49,22 +48,68 @@ pip install -e '.[dev]'
49
48
  ## Usage
50
49
  ```python
51
50
  from cartesia.tts import CartesiaTTS
52
- from IPython.display import Audio
51
+ import pyaudio
52
+ import os
53
53
 
54
54
  client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
-
56
55
  voices = client.get_voices()
57
- embedding = voices["Milo"]["embedding"]
56
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
58
57
  transcript = "Hello! Welcome to Cartesia"
59
58
 
60
- # No streaming
61
- output = client.generate(transcript=transcript, voice=embedding)
62
- Audio(output["audio"], rate=output["sampling_rate"])
59
+ p = pyaudio.PyAudio()
60
+
61
+ stream = None
63
62
 
64
- # Streaming
65
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
66
- arr = output["audio"] # a numpy array
63
+ # Generate and stream audio
64
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
+ buffer = output["audio"]
67
66
  rate = output["sampling_rate"]
67
+
68
+ if not stream:
69
+ stream = p.open(format=pyaudio.paFloat32,
70
+ channels=1,
71
+ rate=rate,
72
+ output=True)
73
+
74
+ # Write the audio data to the stream
75
+ stream.write(buffer)
76
+
77
+ stream.stop_stream()
78
+ stream.close()
79
+ p.terminate()
80
+ ```
81
+
82
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
83
+
84
+ ```python
85
+ from cartesia.tts import CartesiaTTS
86
+ from IPython.display import Audio
87
+ import io
88
+ import os
89
+
90
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
+ voices = client.get_voices()
92
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
+ transcript = "Hello! Welcome to Cartesia"
94
+
95
+ # Create a BytesIO object to store the audio data
96
+ audio_data = io.BytesIO()
97
+
98
+ # Generate and stream audio
99
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
+ buffer = output["audio"]
101
+ audio_data.write(buffer)
102
+
103
+ # Set the cursor position to the beginning of the BytesIO object
104
+ audio_data.seek(0)
105
+
106
+ # Create an Audio object from the BytesIO data
107
+ audio = Audio(audio_data, rate=output["sampling_rate"])
108
+
109
+ # Display the Audio object
110
+ display(audio)
68
111
  ```
69
112
 
70
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
113
+ To avoid storing your API key in the source code, we recommend doing one of the following:
114
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
115
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -0,0 +1,81 @@
1
+ # Cartesia Python API Library
2
+ The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
3
+
4
+ **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
5
+
6
+ ## Installation
7
+ ```bash
8
+ pip install cartesia
9
+
10
+ # pip install in editable mode w/ dev dependencies
11
+ pip install -e '.[dev]'
12
+ ```
13
+
14
+ ## Usage
15
+ ```python
16
+ from cartesia.tts import CartesiaTTS
17
+ import pyaudio
18
+ import os
19
+
20
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
21
+ voices = client.get_voices()
22
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
23
+ transcript = "Hello! Welcome to Cartesia"
24
+
25
+ p = pyaudio.PyAudio()
26
+
27
+ stream = None
28
+
29
+ # Generate and stream audio
30
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
31
+ buffer = output["audio"]
32
+ rate = output["sampling_rate"]
33
+
34
+ if not stream:
35
+ stream = p.open(format=pyaudio.paFloat32,
36
+ channels=1,
37
+ rate=rate,
38
+ output=True)
39
+
40
+ # Write the audio data to the stream
41
+ stream.write(buffer)
42
+
43
+ stream.stop_stream()
44
+ stream.close()
45
+ p.terminate()
46
+ ```
47
+
48
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
49
+
50
+ ```python
51
+ from cartesia.tts import CartesiaTTS
52
+ from IPython.display import Audio
53
+ import io
54
+ import os
55
+
56
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
57
+ voices = client.get_voices()
58
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
59
+ transcript = "Hello! Welcome to Cartesia"
60
+
61
+ # Create a BytesIO object to store the audio data
62
+ audio_data = io.BytesIO()
63
+
64
+ # Generate and stream audio
65
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
66
+ buffer = output["audio"]
67
+ audio_data.write(buffer)
68
+
69
+ # Set the cursor position to the beginning of the BytesIO object
70
+ audio_data.seek(0)
71
+
72
+ # Create an Audio object from the BytesIO data
73
+ audio = Audio(audio_data, rate=output["sampling_rate"])
74
+
75
+ # Display the Audio object
76
+ display(audio)
77
+ ```
78
+
79
+ To avoid storing your API key in the source code, we recommend doing one of the following:
80
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
81
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -4,7 +4,6 @@ import os
4
4
  import uuid
5
5
  from typing import Any, Dict, Generator, List, Optional, TypedDict, Union
6
6
 
7
- import numpy as np
8
7
  import requests
9
8
  from websockets.sync.client import connect
10
9
 
@@ -14,7 +13,7 @@ DEFAULT_API_VERSION = "v0"
14
13
 
15
14
 
16
15
  class AudioOutput(TypedDict):
17
- audio: np.ndarray
16
+ audio: bytes
18
17
  sampling_rate: int
19
18
 
20
19
 
@@ -32,7 +31,11 @@ class CartesiaTTS:
32
31
  """The client for Cartesia's text-to-speech library.
33
32
 
34
33
  This client contains methods to interact with the Cartesia text-to-speech API.
35
- The API offers
34
+ The client can be used to retrieve available voices, compute new voice embeddings,
35
+ and generate speech from text.
36
+
37
+ The client also supports generating audio using a websocket for lower latency.
38
+ To enable interrupt handling along the websocket, set `experimental_ws_handle_interrupts=True`.
36
39
 
37
40
  Examples:
38
41
 
@@ -56,18 +59,22 @@ class CartesiaTTS:
56
59
  ... audio, sr = audio_chunk["audio"], audio_chunk["sampling_rate"]
57
60
  """
58
61
 
59
- def __init__(self, *, api_key: str = None):
62
+ def __init__(self, *, api_key: str = None, experimental_ws_handle_interrupts: bool = False):
60
63
  """
61
64
  Args:
62
65
  api_key: The API key to use for authorization.
63
66
  If not specified, the API key will be read from the environment variable
64
67
  `CARTESIA_API_KEY`.
68
+ experimental_ws_handle_interrupts: Whether to handle interrupts when generating
69
+ audio using the websocket. This is an experimental feature and may have bugs
70
+ or be deprecated in the future.
65
71
  """
66
72
  self.base_url = os.environ.get("CARTESIA_BASE_URL", DEFAULT_BASE_URL)
67
73
  self.api_key = api_key or os.environ.get("CARTESIA_API_KEY")
68
74
  self.api_version = os.environ.get("CARTESIA_API_VERSION", DEFAULT_API_VERSION)
69
75
  self.headers = {"X-API-Key": self.api_key, "Content-Type": "application/json"}
70
76
  self.websocket = None
77
+ self.experimental_ws_handle_interrupts = experimental_ws_handle_interrupts
71
78
  self.refresh_websocket()
72
79
 
73
80
  def get_voices(self, skip_embeddings: bool = True) -> Dict[str, VoiceMetadata]:
@@ -168,21 +175,37 @@ class CartesiaTTS:
168
175
  """
169
176
  if self.websocket and not self._is_websocket_closed():
170
177
  self.websocket.close()
178
+ route = "audio/websocket"
179
+ if self.experimental_ws_handle_interrupts:
180
+ route = f"experimental/{route}"
171
181
  self.websocket = connect(
172
- f"{self._ws_url()}/audio/websocket?api_key={self.api_key}",
182
+ f"{self._ws_url()}/{route}?api_key={self.api_key}",
173
183
  close_timeout=None,
174
184
  )
175
185
 
176
186
  def _is_websocket_closed(self):
177
187
  return self.websocket.socket.fileno() == -1
178
188
 
189
+ def _check_inputs(
190
+ self, transcript: str, duration: Optional[float], chunk_time: Optional[float]
191
+ ):
192
+ if chunk_time is not None:
193
+ if chunk_time < 0.1 or chunk_time > 0.5:
194
+ raise ValueError("`chunk_time` must be between 0.1 and 0.5")
195
+
196
+ if chunk_time is not None and duration is not None:
197
+ if duration < chunk_time:
198
+ raise ValueError("`duration` must be greater than chunk_time")
199
+
200
+ if transcript.strip() == "":
201
+ raise ValueError("`transcript` must be non empty")
202
+
179
203
  def generate(
180
204
  self,
181
205
  *,
182
206
  transcript: str,
183
207
  duration: int = None,
184
208
  chunk_time: float = None,
185
- lookahead: int = None,
186
209
  voice: Embedding = None,
187
210
  stream: bool = False,
188
211
  websocket: bool = True,
@@ -194,8 +217,6 @@ class CartesiaTTS:
194
217
  duration: The maximum duration of the audio in seconds.
195
218
  chunk_time: How long each audio segment should be in seconds.
196
219
  This should not need to be adjusted.
197
- lookahead: The number of seconds to look ahead for each chunk.
198
- This should not need to be adjusted.
199
220
  voice: The voice to use for generating audio.
200
221
  This can either be a voice id (string) or an embedding vector (List[float]).
201
222
  stream: Whether to stream the audio or not.
@@ -206,18 +227,16 @@ class CartesiaTTS:
206
227
  Returns:
207
228
  A generator if `stream` is True, otherwise a dictionary.
208
229
  Dictionary from both generator and non-generator return types have the following keys:
209
- * "audio": The audio as a 1D numpy array.
230
+ * "audio": The audio as a bytes buffer.
210
231
  * "sampling_rate": The sampling rate of the audio.
211
232
  """
212
- body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
233
+ self._check_inputs(transcript, duration, chunk_time)
213
234
 
214
- if isinstance(voice, str):
215
- voice = self._voices[voice]
235
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID)
216
236
 
217
237
  optional_body = dict(
218
238
  duration=duration,
219
239
  chunk_time=chunk_time,
220
- lookahead=lookahead,
221
240
  voice=voice,
222
241
  )
223
242
  body.update({k: v for k, v in optional_body.items() if v is not None})
@@ -237,7 +256,7 @@ class CartesiaTTS:
237
256
  sampling_rate = chunk["sampling_rate"]
238
257
  chunks.append(chunk["audio"])
239
258
 
240
- return {"audio": np.concatenate(chunks), "sampling_rate": sampling_rate}
259
+ return {"audio": b"".join(chunks), "sampling_rate": sampling_rate}
241
260
 
242
261
  def _generate_http(self, body: Dict[str, Any]):
243
262
  response = requests.post(
@@ -258,8 +277,7 @@ class CartesiaTTS:
258
277
  if start_index != -1 and end_index != -1:
259
278
  try:
260
279
  chunk_json = json.loads(buffer[start_index : end_index + 1])
261
- data = base64.b64decode(chunk_json["data"])
262
- audio = np.frombuffer(data, dtype=np.float32)
280
+ audio = base64.b64decode(chunk_json["data"])
263
281
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
264
282
  buffer = buffer[end_index + 1 :]
265
283
  except json.JSONDecodeError:
@@ -268,28 +286,55 @@ class CartesiaTTS:
268
286
  if buffer:
269
287
  try:
270
288
  chunk_json = json.loads(buffer)
271
- data = base64.b64decode(chunk_json["data"])
272
- audio = np.frombuffer(data, dtype=np.float32)
289
+ audio = base64.b64decode(chunk_json["data"])
273
290
  yield {"audio": audio, "sampling_rate": chunk_json["sampling_rate"]}
274
291
  except json.JSONDecodeError:
275
292
  pass
276
293
 
277
- def _generate_ws(self, body: Dict[str, Any]):
294
+ def _generate_ws(self, body: Dict[str, Any], *, context_id: str = None):
295
+ """Generate audio using the websocket connection.
296
+
297
+ Args:
298
+ body: The request body.
299
+ context_id: The context id for the request.
300
+ The context id must be globally unique for the duration this client exists.
301
+ If this is provided, the context id that is in the response will
302
+ also be returned as part of the dict. This is helpful for testing.
303
+ """
278
304
  if not self.websocket or self._is_websocket_closed():
279
305
  self.refresh_websocket()
280
306
 
281
- self.websocket.send(json.dumps({"data": body, "context_id": uuid.uuid4().hex}))
307
+ include_context_id = bool(context_id)
308
+ if context_id is None:
309
+ context_id = uuid.uuid4().hex
310
+ self.websocket.send(json.dumps({"data": body, "context_id": context_id}))
282
311
  try:
283
- response = json.loads(self.websocket.recv())
284
- while not response["done"]:
285
- data = base64.b64decode(response["data"])
286
- audio = np.frombuffer(data, dtype=np.float32)
287
- # print("timing", time.perf_counter() - start)
288
- yield {"audio": audio, "sampling_rate": response["sampling_rate"]}
289
-
312
+ while True:
290
313
  response = json.loads(self.websocket.recv())
291
- except Exception:
292
- raise RuntimeError(f"Failed to generate audio. {response}")
314
+ if response["done"]:
315
+ break
316
+ audio = base64.b64decode(response["data"])
317
+
318
+ optional_kwargs = {}
319
+ if include_context_id:
320
+ optional_kwargs["context_id"] = response["context_id"]
321
+
322
+ yield {
323
+ "audio": audio,
324
+ "sampling_rate": response["sampling_rate"],
325
+ **optional_kwargs,
326
+ }
327
+
328
+ if self.experimental_ws_handle_interrupts:
329
+ self.websocket.send(json.dumps({"context_id": context_id}))
330
+ except GeneratorExit:
331
+ # The exit is only called when the generator is garbage collected.
332
+ # It may not be called directly after a break statement.
333
+ # However, the generator will be automatically cancelled on the next request.
334
+ if self.experimental_ws_handle_interrupts:
335
+ self.websocket.send(json.dumps({"context_id": context_id, "action": "cancel"}))
336
+ except Exception as e:
337
+ raise RuntimeError(f"Failed to generate audio. {response}") from e
293
338
 
294
339
  def _http_url(self):
295
340
  prefix = "http" if "localhost" in self.base_url else "https"
@@ -0,0 +1 @@
1
+ __version__ = "0.0.4"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -12,7 +12,6 @@ Requires-Python: >=3.8.0
12
12
  Description-Content-Type: text/markdown
13
13
  Requires-Dist: websockets
14
14
  Requires-Dist: requests
15
- Requires-Dist: numpy
16
15
  Provides-Extra: dev
17
16
  Requires-Dist: pre-commit; extra == "dev"
18
17
  Requires-Dist: docformatter; extra == "dev"
@@ -49,22 +48,68 @@ pip install -e '.[dev]'
49
48
  ## Usage
50
49
  ```python
51
50
  from cartesia.tts import CartesiaTTS
52
- from IPython.display import Audio
51
+ import pyaudio
52
+ import os
53
53
 
54
54
  client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
55
-
56
55
  voices = client.get_voices()
57
- embedding = voices["Milo"]["embedding"]
56
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
58
57
  transcript = "Hello! Welcome to Cartesia"
59
58
 
60
- # No streaming
61
- output = client.generate(transcript=transcript, voice=embedding)
62
- Audio(output["audio"], rate=output["sampling_rate"])
59
+ p = pyaudio.PyAudio()
60
+
61
+ stream = None
63
62
 
64
- # Streaming
65
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
66
- arr = output["audio"] # a numpy array
63
+ # Generate and stream audio
64
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
65
+ buffer = output["audio"]
67
66
  rate = output["sampling_rate"]
67
+
68
+ if not stream:
69
+ stream = p.open(format=pyaudio.paFloat32,
70
+ channels=1,
71
+ rate=rate,
72
+ output=True)
73
+
74
+ # Write the audio data to the stream
75
+ stream.write(buffer)
76
+
77
+ stream.stop_stream()
78
+ stream.close()
79
+ p.terminate()
80
+ ```
81
+
82
+ If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook. Here's an example:
83
+
84
+ ```python
85
+ from cartesia.tts import CartesiaTTS
86
+ from IPython.display import Audio
87
+ import io
88
+ import os
89
+
90
+ client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
91
+ voices = client.get_voices()
92
+ voice = client.get_voice_embedding(voice_id=voices["Graham"]["id"])
93
+ transcript = "Hello! Welcome to Cartesia"
94
+
95
+ # Create a BytesIO object to store the audio data
96
+ audio_data = io.BytesIO()
97
+
98
+ # Generate and stream audio
99
+ for output in client.generate(transcript=transcript, voice=voice, stream=True):
100
+ buffer = output["audio"]
101
+ audio_data.write(buffer)
102
+
103
+ # Set the cursor position to the beginning of the BytesIO object
104
+ audio_data.seek(0)
105
+
106
+ # Create an Audio object from the BytesIO data
107
+ audio = Audio(audio_data, rate=output["sampling_rate"])
108
+
109
+ # Display the Audio object
110
+ display(audio)
68
111
  ```
69
112
 
70
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
113
+ To avoid storing your API key in the source code, we recommend doing one of the following:
114
+ 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
115
+ 1. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
@@ -1,6 +1,5 @@
1
1
  websockets
2
2
  requests
3
- numpy
4
3
 
5
4
  [all]
6
5
  pre-commit
@@ -78,7 +78,8 @@ class UploadCommand(Command):
78
78
  """Support setup.py upload."""
79
79
 
80
80
  description = "Build and publish the package."
81
- user_options = []
81
+ user_options = [("skip-upload", "u", "skip git tagging and pypi upload")]
82
+ boolean_options = ["skip-upload"]
82
83
 
83
84
  @staticmethod
84
85
  def status(s):
@@ -86,21 +87,26 @@ class UploadCommand(Command):
86
87
  print("\033[1m{0}\033[0m".format(s))
87
88
 
88
89
  def initialize_options(self):
89
- pass
90
+ self.skip_upload = False
90
91
 
91
92
  def finalize_options(self):
92
- pass
93
+ self.skip_upload = bool(self.skip_upload)
93
94
 
94
95
  def run(self):
95
96
  try:
96
97
  self.status("Removing previous builds…")
97
98
  rmtree(os.path.join(here, "dist"))
99
+ rmtree(os.path.join(here, "build"))
98
100
  except OSError:
99
101
  pass
100
102
 
101
103
  self.status("Building Source and Wheel (universal) distribution…")
102
104
  os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
103
105
 
106
+ if self.skip_upload:
107
+ self.status("Skipping git tagging and pypi upload")
108
+ sys.exit()
109
+
104
110
  self.status("Uploading the package to PyPI via Twine…")
105
111
  os.system("twine upload dist/*")
106
112
 
@@ -116,6 +122,9 @@ class BumpVersionCommand(Command):
116
122
  To use: python setup.py bumpversion -v <version>
117
123
 
118
124
  This command will push the new version directly and tag it.
125
+
126
+ Usage:
127
+ python setup.py bumpversion --version=1.0.1
119
128
  """
120
129
 
121
130
  description = "Installs the foo."
@@ -130,6 +139,11 @@ class BumpVersionCommand(Command):
130
139
 
131
140
  def initialize_options(self):
132
141
  self.version = None
142
+ self.base_branch = None
143
+ self.version_branch = None
144
+ self.updated_files = [
145
+ "cartesia/version.py",
146
+ ]
133
147
 
134
148
  def finalize_options(self):
135
149
  # This package cannot be imported at top level because it
@@ -147,14 +161,18 @@ class BumpVersionCommand(Command):
147
161
  )
148
162
 
149
163
  def _undo(self):
150
- os.system(f"git restore --staged {PACKAGE_DIR}/__init__.py")
151
- os.system(f"git checkout -- {PACKAGE_DIR}/__init__.py")
164
+ os.system(f"git restore --staged {' '.join(self.updated_files)}")
165
+ os.system(f"git checkout -- {' '.join(self.updated_files)}")
166
+
167
+ # Return to the original branch
168
+ os.system(f"git checkout {self.base_branch}")
169
+ os.system(f"git branch -D {self.version_branch}")
152
170
 
153
171
  def run(self):
154
172
  current_version = about["__version__"]
155
173
 
156
174
  self.status("Checking current branch is 'main'")
157
- current_branch = get_git_branch()
175
+ self.base_branch = current_branch = get_git_branch()
158
176
  if current_branch != "main":
159
177
  raise RuntimeError(
160
178
  "You can only bump the version from the 'main' branch. "
@@ -174,18 +192,25 @@ class BumpVersionCommand(Command):
174
192
 
175
193
  # TODO: Add check to see if all tests are passing on main.
176
194
 
195
+ # Checkout new branch
196
+ self.version_branch = f"bumpversion/v{self.version}"
197
+ self.status(f"Create branch '{self.version_branch}'")
198
+ err_code = os.system(f"git checkout -b {self.version_branch}")
199
+ if err_code != 0:
200
+ raise RuntimeError("Failed to create branch.")
201
+
177
202
  # Change the version in __init__.py
178
203
  self.status(f"Updating version {current_version} -> {self.version}")
179
204
  update_version(self.version)
180
- if current_version != self.version:
181
- self._undo()
182
- raise RuntimeError("Failed to update version.")
205
+ # if current_version != self.version:
206
+ # self._undo()
207
+ # raise RuntimeError("Failed to update version.")
183
208
 
184
- self.status(f"Adding {PACKAGE_DIR}/__init__.py to git")
185
- err_code = os.system(f"git add {PACKAGE_DIR}/__init__.py")
209
+ self.status(f"Adding {', '.join(self.updated_files)} to git")
210
+ err_code = os.system(f"git add {' '.join(self.updated_files)}")
186
211
  if err_code != 0:
187
212
  self._undo()
188
- raise RuntimeError("Failed to add file to git.")
213
+ raise RuntimeError("Failed to add files to git.")
189
214
 
190
215
  # Commit the file with a message '[bumpversion] v<version>'.
191
216
  self.status(f"Commit with message '[bumpversion] v{self.version}'")
@@ -195,12 +220,15 @@ class BumpVersionCommand(Command):
195
220
  raise RuntimeError("Failed to commit file to git.")
196
221
 
197
222
  # Push the commit to origin.
198
- # self.status("Pushing commit to origin")
199
- # err_code = os.system("git push")
200
- # if err_code != 0:
201
- # # TODO: undo the commit automatically.
202
- # raise RuntimeError("Failed to push commit to origin.")
223
+ self.status(f"Pushing commit to origin/{self.version_branch}")
224
+ err_code = os.system(f"git push --force --set-upstream origin {self.version_branch}")
225
+ if err_code != 0:
226
+ # TODO: undo the commit automatically.
227
+ self._undo()
228
+ raise RuntimeError("Failed to push commit to origin.")
203
229
 
230
+ os.system(f"git checkout {self.base_branch}")
231
+ os.system(f"git branch -D {self.version_branch}")
204
232
  sys.exit()
205
233
 
206
234
 
@@ -0,0 +1,180 @@
1
+ """Test against the production Cartesia TTS API.
2
+
3
+ This test suite tries to be as general as possible because different keys
4
+ will lead to different results. Therefore, we cannot test for complete correctness
5
+ but rather for general correctness.
6
+ """
7
+
8
+ import os
9
+ import uuid
10
+ from typing import Dict, Generator, List
11
+
12
+ import pytest
13
+
14
+ from cartesia.tts import DEFAULT_MODEL_ID, CartesiaTTS, VoiceMetadata
15
+
16
+ SAMPLE_VOICE = "Milo"
17
+
18
+
19
+ class _Resources:
20
+ def __init__(self, *, client: CartesiaTTS, voices: Dict[str, VoiceMetadata]):
21
+ self.client = client
22
+ self.voices = voices
23
+
24
+
25
+ @pytest.fixture(scope="session")
26
+ def client():
27
+ return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
28
+
29
+
30
+ @pytest.fixture(scope="session")
31
+ def client_with_ws_interrupt():
32
+ return CartesiaTTS(
33
+ api_key=os.environ.get("CARTESIA_API_KEY"), experimental_ws_handle_interrupts=True
34
+ )
35
+
36
+
37
+ @pytest.fixture(scope="session")
38
+ def resources(client: CartesiaTTS):
39
+ voices = client.get_voices()
40
+ voice_id = voices[SAMPLE_VOICE]["id"]
41
+ voices[SAMPLE_VOICE]["embedding"] = client.get_voice_embedding(voice_id=voice_id)
42
+
43
+ return _Resources(
44
+ client=client,
45
+ voices=voices,
46
+ )
47
+
48
+
49
+ def test_get_voices(client: CartesiaTTS):
50
+ voices = client.get_voices()
51
+
52
+ assert isinstance(voices, dict)
53
+ assert all(isinstance(key, str) for key in voices.keys())
54
+ ids = [voice["id"] for voice in voices.values()]
55
+ assert len(ids) == len(set(ids)), "All ids must be unique"
56
+ assert all(
57
+ key == voice["name"] for key, voice in voices.items()
58
+ ), "The key must be the same as the name"
59
+
60
+
61
+ def test_get_voice_embedding_from_id(client: CartesiaTTS):
62
+ voices = client.get_voices()
63
+ voice_id = voices[SAMPLE_VOICE]["id"]
64
+
65
+ client.get_voice_embedding(voice_id=voice_id)
66
+
67
+
68
+ def test_get_voice_embedding_from_url(client: CartesiaTTS):
69
+ url = "https://youtu.be/g2Z7Ddd573M?si=P8BM_hBqt5P8Ft6I&t=69"
70
+ _ = client.get_voice_embedding(link=url)
71
+
72
+
73
+ @pytest.mark.parametrize("websocket", [True, False])
74
+ def test_generate(resources: _Resources, websocket: bool):
75
+ client = resources.client
76
+ voices = resources.voices
77
+ embedding = voices[SAMPLE_VOICE]["embedding"]
78
+ transcript = "Hello, world!"
79
+
80
+ output = client.generate(transcript=transcript, voice=embedding, websocket=websocket)
81
+ assert output.keys() == {"audio", "sampling_rate"}
82
+ assert isinstance(output["audio"], bytes)
83
+ assert isinstance(output["sampling_rate"], int)
84
+
85
+
86
+ @pytest.mark.parametrize("websocket", [True, False])
87
+ def test_generate_stream(resources: _Resources, websocket: bool):
88
+ client = resources.client
89
+ voices = resources.voices
90
+ embedding = voices[SAMPLE_VOICE]["embedding"]
91
+ transcript = "Hello, world!"
92
+
93
+ generator = client.generate(
94
+ transcript=transcript, voice=embedding, websocket=websocket, stream=True
95
+ )
96
+ assert isinstance(generator, Generator)
97
+
98
+ for output in generator:
99
+ assert output.keys() == {"audio", "sampling_rate"}
100
+ assert isinstance(output["audio"], bytes)
101
+ assert isinstance(output["sampling_rate"], int)
102
+
103
+
104
+ @pytest.mark.parametrize(
105
+ "actions",
106
+ [
107
+ ["cancel-5", None],
108
+ ["cancel-5", "cancel-1", None],
109
+ [None, "cancel-3", None],
110
+ [None, "cancel-1", "cancel-2"],
111
+ ],
112
+ )
113
+ def test_generate_stream_interrupt(
114
+ client_with_ws_interrupt: CartesiaTTS, resources: _Resources, actions: List[str]
115
+ ):
116
+ client = client_with_ws_interrupt
117
+ voices = resources.voices
118
+ embedding = voices[SAMPLE_VOICE]["embedding"]
119
+ transcript = "Hello, world!"
120
+
121
+ context_ids = [f"test-{uuid.uuid4().hex[:6]}" for _ in range(len(actions))]
122
+
123
+ for context_id, action in zip(context_ids, actions):
124
+ body = dict(transcript=transcript, model_id=DEFAULT_MODEL_ID, voice=embedding)
125
+
126
+ # Parse actions to see what we should expect.
127
+ if action is None:
128
+ num_turns = None
129
+ elif "cancel" in action:
130
+ num_turns = int(action.split("-")[1])
131
+
132
+ generator = client._generate_ws(body, context_id=context_id)
133
+ for idx, response in enumerate(generator):
134
+ assert response.keys() == {"audio", "sampling_rate", "context_id"}
135
+ assert response["context_id"] == context_id, (
136
+ f"Context ID from response ({response['context_id']}) does not match "
137
+ f"the expected context ID ({context_id})"
138
+ )
139
+ if idx + 1 == num_turns:
140
+ break
141
+
142
+
143
+ @pytest.mark.parametrize("chunk_time", [0.05, 0.6])
144
+ def test_check_inputs_invalid_chunk_time(client: CartesiaTTS, chunk_time):
145
+ with pytest.raises(ValueError, match="`chunk_time` must be between 0.1 and 0.5"):
146
+ client._check_inputs("Test", None, chunk_time)
147
+
148
+
149
+ @pytest.mark.parametrize("chunk_time", [0.1, 0.3, 0.5])
150
+ def test_check_inputs_valid_chunk_time(client, chunk_time):
151
+ try:
152
+ client._check_inputs("Test", None, chunk_time)
153
+ except ValueError:
154
+ pytest.fail("Unexpected ValueError raised")
155
+
156
+
157
+ def test_check_inputs_duration_less_than_chunk_time(client: CartesiaTTS):
158
+ with pytest.raises(ValueError, match="`duration` must be greater than chunk_time"):
159
+ client._check_inputs("Test", 0.2, 0.3)
160
+
161
+
162
+ @pytest.mark.parametrize("duration,chunk_time", [(0.5, 0.2), (1.0, 0.5), (2.0, 0.1)])
163
+ def test_check_inputs_valid_duration_and_chunk_time(client: CartesiaTTS, duration, chunk_time):
164
+ try:
165
+ client._check_inputs("Test", duration, chunk_time)
166
+ except ValueError:
167
+ pytest.fail("Unexpected ValueError raised")
168
+
169
+
170
+ def test_check_inputs_empty_transcript(client: CartesiaTTS):
171
+ with pytest.raises(ValueError, match="`transcript` must be non empty"):
172
+ client._check_inputs("", None, None)
173
+
174
+
175
+ @pytest.mark.parametrize("transcript", ["Hello", "Test transcript", "Lorem ipsum dolor sit amet"])
176
+ def test_check_inputs_valid_transcript(client: CartesiaTTS, transcript):
177
+ try:
178
+ client._check_inputs(transcript, None, None)
179
+ except ValueError:
180
+ pytest.fail("Unexpected ValueError raised")
cartesia-0.0.2/README.md DELETED
@@ -1,35 +0,0 @@
1
- # Cartesia Python API Library
2
- The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
3
-
4
- **Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
5
-
6
- ## Installation
7
- ```bash
8
- pip install cartesia
9
-
10
- # pip install in editable mode w/ dev dependencies
11
- pip install -e '.[dev]'
12
- ```
13
-
14
- ## Usage
15
- ```python
16
- from cartesia.tts import CartesiaTTS
17
- from IPython.display import Audio
18
-
19
- client = CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
20
-
21
- voices = client.get_voices()
22
- embedding = voices["Milo"]["embedding"]
23
- transcript = "Hello! Welcome to Cartesia"
24
-
25
- # No streaming
26
- output = client.generate(transcript=transcript, voice=embedding)
27
- Audio(output["audio"], rate=output["sampling_rate"])
28
-
29
- # Streaming
30
- for output in client.generate(transcript=transcript, voice=embedding, stream=True):
31
- arr = output["audio"] # a numpy array
32
- rate = output["sampling_rate"]
33
- ```
34
-
35
- We recommend using [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file so that your API Key is not stored in the source code.
@@ -1 +0,0 @@
1
- __version__ = "0.0.2"
@@ -1,96 +0,0 @@
1
- """Test against the production Cartesia TTS API.
2
-
3
- This test suite tries to be as general as possible because different keys
4
- will lead to different results. Therefore, we cannot test for complete correctness
5
- but rather for general correctness.
6
- """
7
-
8
- import os
9
- from typing import Dict, Generator
10
-
11
- import numpy as np
12
- import pytest
13
-
14
- from cartesia.tts import CartesiaTTS, VoiceMetadata
15
-
16
- SAMPLE_VOICE = "Milo"
17
-
18
-
19
- class _Resources:
20
- def __init__(self, *, client: CartesiaTTS, voices: Dict[str, VoiceMetadata]):
21
- self.client = client
22
- self.voices = voices
23
-
24
-
25
- @pytest.fixture(scope="session")
26
- def client():
27
- return CartesiaTTS(api_key=os.environ.get("CARTESIA_API_KEY"))
28
-
29
-
30
- @pytest.fixture(scope="session")
31
- def resources(client: CartesiaTTS):
32
- voices = client.get_voices()
33
- voice_id = voices[SAMPLE_VOICE]["id"]
34
- voices[SAMPLE_VOICE]["embedding"] = client.get_voice_embedding(voice_id=voice_id)
35
-
36
- return _Resources(
37
- client=client,
38
- voices=voices,
39
- )
40
-
41
-
42
- def test_get_voices(client: CartesiaTTS):
43
- voices = client.get_voices()
44
-
45
- assert isinstance(voices, dict)
46
- assert all(isinstance(key, str) for key in voices.keys())
47
- ids = [voice["id"] for voice in voices.values()]
48
- assert len(ids) == len(set(ids)), "All ids must be unique"
49
- assert all(
50
- key == voice["name"] for key, voice in voices.items()
51
- ), "The key must be the same as the name"
52
-
53
-
54
- def test_get_voice_embedding_from_id(client: CartesiaTTS):
55
- voices = client.get_voices()
56
- voice_id = voices[SAMPLE_VOICE]["id"]
57
-
58
- client.get_voice_embedding(voice_id=voice_id)
59
-
60
-
61
- def test_get_voice_embedding_from_url(client: CartesiaTTS):
62
- url = "https://youtu.be/g2Z7Ddd573M?si=P8BM_hBqt5P8Ft6I&t=69"
63
- _ = client.get_voice_embedding(link=url)
64
-
65
-
66
- @pytest.mark.parametrize("websocket", [True, False])
67
- def test_generate(resources: _Resources, websocket: bool):
68
- client = resources.client
69
- voices = resources.voices
70
- embedding = voices[SAMPLE_VOICE]["embedding"]
71
- transcript = "Hello, world!"
72
-
73
- output = client.generate(transcript=transcript, voice=embedding, websocket=websocket)
74
- assert output.keys() == {"audio", "sampling_rate"}
75
- assert isinstance(output["audio"], np.ndarray)
76
- assert output["audio"].dtype == np.float32
77
- assert isinstance(output["sampling_rate"], int)
78
-
79
-
80
- @pytest.mark.parametrize("websocket", [True, False])
81
- def test_generate_stream(resources: _Resources, websocket: bool):
82
- client = resources.client
83
- voices = resources.voices
84
- embedding = voices[SAMPLE_VOICE]["embedding"]
85
- transcript = "Hello, world!"
86
-
87
- generator = client.generate(
88
- transcript=transcript, voice=embedding, websocket=websocket, stream=True
89
- )
90
- assert isinstance(generator, Generator)
91
-
92
- for output in generator:
93
- assert output.keys() == {"audio", "sampling_rate"}
94
- assert isinstance(output["audio"], np.ndarray)
95
- assert output["audio"].dtype == np.float32
96
- assert isinstance(output["sampling_rate"], int)
File without changes
File without changes
File without changes