cartesia 1.0.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.0
+Version: 1.0.1
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -21,10 +21,8 @@ Provides-Extra: all
 
 The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
 
-**Note:** This API is still in alpha. Please expect breaking changes and report any issues you encounter.
-
 > [!IMPORTANT]
-> The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/discussions/44) here and reach out to us on [Discord](https://discord.gg/ZVxavqHB9X) if you have any questions!
+> The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/ZVxavqHB9X) for any support requests!
 
 ## Documentation
 
@@ -59,7 +57,11 @@ print("The embedding for", voice["name"], "is", voice["embedding"])
 cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")
 
 # Create a new voice
-new_voice = client.voices.create(name="New Voice", description="A clone of my own voice", embedding=cloned_voice_embedding)
+new_voice = client.voices.create(
+    name="New Voice",
+    description="A clone of my own voice",
+    embedding=cloned_voice_embedding,
+)
 ```
 
 ## Text-to-Speech
@@ -94,14 +96,17 @@ rate = 44100
 stream = None
 
 # Generate and stream audio
-for output in client.tts.sse(model_id=model_id, transcript=transcript, voice_embedding=voice["embedding"], stream=True, output_format=output_format):
+for output in client.tts.sse(
+    model_id=model_id,
+    transcript=transcript,
+    voice_embedding=voice["embedding"],
+    stream=True,
+    output_format=output_format,
+):
     buffer = output["audio"]
 
     if not stream:
-        stream = p.open(format=pyaudio.paFloat32,
-                        channels=1,
-                        rate=rate,
-                        output=True)
+        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
 
     # Write the audio data to the stream
    stream.write(buffer)
@@ -119,6 +124,7 @@ import asyncio
 import pyaudio
 import os
 
+
 async def write_stream():
     client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
     voice_name = "Barbershop Man"
@@ -141,15 +147,19 @@ async def write_stream():
     stream = None
 
     # Generate and stream audio
-    async for output in await client.tts.sse(model_id=model_id, transcript=transcript, voice_embedding=voice["embedding"], stream=True, output_format=output_format
+    async for output in await client.tts.sse(
+        model_id=model_id,
+        transcript=transcript,
+        voice_embedding=voice["embedding"],
+        stream=True,
+        output_format=output_format,
     ):
         buffer = output["audio"]
 
         if not stream:
-            stream = p.open(format=pyaudio.paFloat32,
-                            channels=1,
-                            rate=rate,
-                            output=True)
+            stream = p.open(
+                format=pyaudio.paFloat32, channels=1, rate=rate, output=True
+            )
 
         # Write the audio data to the stream
         stream.write(buffer)
@@ -157,6 +167,8 @@ async def write_stream():
     stream.stop_stream()
     stream.close()
     p.terminate()
+    await client.close()
+
 
 asyncio.run(write_stream())
 ```
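
The `await client.close()` call added above releases the async client's underlying session explicitly. An equivalent pattern, and the one the README's own batch example further down already uses, is the async context manager form. A minimal sketch, assuming the same environment variable as the README:

```python
import os

from cartesia import AsyncCartesia


async def main():
    # The context manager closes the client on exit, making an explicit
    # `await client.close()` unnecessary.
    async with AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
        ...  # stream TTS as in the example above
```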
@@ -193,14 +205,17 @@ stream = None
 ws = client.tts.websocket()
 
 # Generate and stream audio using the websocket
-for output in ws.send(model_id=model_id, transcript=transcript, voice_embedding=voice["embedding"], stream=True, output_format=output_format):
+for output in ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_embedding=voice["embedding"],
+    stream=True,
+    output_format=output_format,
+):
     buffer = output["audio"]
 
     if not stream:
-        stream = p.open(format=pyaudio.paFloat32,
-                        channels=1,
-                        rate=rate,
-                        output=True)
+        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
 
     # Write the audio data to the stream
     stream.write(buffer)
@@ -209,7 +224,7 @@ stream.stop_stream()
 stream.close()
 p.terminate()
 
-ws.close() # Close the websocket connection
+ws.close()  # Close the websocket connection
 ```
 
 ### Multilingual Text-to-Speech [Alpha]
@@ -245,14 +260,18 @@ rate = 44100
 stream = None
 
 # Pass in the corresponding language code to the `language` parameter to generate and stream audio.
-for output in client.tts.sse(model_id=model_id, transcript=transcript, voice_embedding=voice["embedding"], stream=True, output_format=output_format, language=language):
+for output in client.tts.sse(
+    model_id=model_id,
+    transcript=transcript,
+    voice_embedding=voice["embedding"],
+    stream=True,
+    output_format=output_format,
+    language=language,
+):
     buffer = output["audio"]
 
     if not stream:
-        stream = p.open(format=pyaudio.paFloat32,
-                        channels=1,
-                        rate=rate,
-                        output=True)
+        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
 
     stream.write(buffer)
 
@@ -287,7 +306,12 @@ with Cartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
     audio_data = io.BytesIO()
 
     # Generate and stream audio
-    for output in client.tts.sse(model_id="sonic-english", transcript=transcript, voice_embedding=voice["embedding"], stream=True, output_format=output_format
+    for output in client.tts.sse(
+        model_id="sonic-english",
+        transcript=transcript,
+        voice_embedding=voice["embedding"],
+        stream=True,
+        output_format=output_format,
     ):
         buffer = output["audio"]
         audio_data.write(buffer)
@@ -326,7 +350,12 @@ async with AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
     audio_data = io.BytesIO()
 
     # Generate and stream audio
-    async for output in client.tts.sse(model_id="sonic-english", transcript=transcript, voice_id=voice_id, stream=True, output_format=output_format
+    async for output in client.tts.sse(
+        model_id="sonic-english",
+        transcript=transcript,
+        voice_id=voice_id,
+        stream=True,
+        output_format=output_format,
    ):
         buffer = output["audio"]
         audio_data.write(buffer)
@@ -341,6 +370,28 @@ audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=rate)
 display(audio)
 ```
 
+### Utility methods
+
+#### Output Formats
+
+You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
+
+The previously used `output_format` strings are now deprecated and will be removed in v1.2.0. These are listed in the `DeprecatedOutputFormatMapping` class in `cartesia._types`.
+
+```python
+# Get the output format dictionary from string name
+output_format = client.tts.get_output_format("raw_pcm_f32le_44100")
+
+# Pass in the output format dictionary to generate and stream audio
+generator = client.tts.sse(
+    model_id=model,
+    transcript=transcript,
+    voice_id=SAMPLE_VOICE_ID,
+    stream=True,
+    output_format=output_format,
+)
+```
+
 To avoid storing your API key in the source code, we recommend doing one of the following:
 
 1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your .env file.
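
The new Output Formats helpers pair naturally with the PyAudio setup used throughout the README. A short sketch under the same assumptions as the examples above (`client.tts.get_sample_rate` appears in the `cartesia/client.py` changes further down):

```python
import os

import pyaudio
from cartesia import Cartesia

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))

# Resolve a new-style name into the dictionary expected by `output_format=`.
output_format = client.tts.get_output_format("raw_pcm_f32le_44100")

# Look up the sample rate for the same name, keeping the playback stream
# configuration in sync with the requested format.
rate = client.tts.get_sample_rate("raw_pcm_f32le_44100")

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
```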
@@ -0,0 +1,75 @@
+from typing import List, TypedDict
+from cartesia.utils.deprecated import deprecated
+
+
+class OutputFormatMapping:
+    _format_mapping = {
+        "raw_pcm_f32le_44100": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
+        "raw_pcm_s16le_44100": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
+        "raw_pcm_f32le_24000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 24000},
+        "raw_pcm_s16le_24000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 24000},
+        "raw_pcm_f32le_22050": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 22050},
+        "raw_pcm_s16le_22050": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 22050},
+        "raw_pcm_f32le_16000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 16000},
+        "raw_pcm_s16le_16000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
+        "raw_pcm_f32le_8000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 8000},
+        "raw_pcm_s16le_8000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 8000},
+        "raw_pcm_mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
+        "raw_pcm_alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
+    }
+
+    @classmethod
+    def get_format(cls, format_name):
+        if format_name in cls._format_mapping:
+            return cls._format_mapping[format_name]
+        else:
+            raise ValueError(f"Unsupported format: {format_name}")
+
+
+class DeprecatedOutputFormatMapping:
+    """Deprecated formats as of v1.0.1. These will be removed in v1.2.0. Use :class:`OutputFormatMapping` instead."""
+
+    _format_mapping = {
+        "fp32": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
+        "pcm": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
+        "fp32_8000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 8000},
+        "fp32_16000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 16000},
+        "fp32_22050": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 22050},
+        "fp32_24000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 24000},
+        "fp32_44100": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
+        "pcm_8000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 8000},
+        "pcm_16000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
+        "pcm_22050": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 22050},
+        "pcm_24000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 24000},
+        "pcm_44100": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
+        "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
+        "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
+    }
+
+    @deprecated(
+        vdeprecated="1.0.1",
+        vremove="1.2.0",
+        reason="Old output format names are being deprecated in favor of names aligned with the Cartesia API. Use names from `OutputFormatMapping` instead.",
+    )
+    def get_format_deprecated(self, format_name):
+        if format_name in self._format_mapping:
+            return self._format_mapping[format_name]
+        else:
+            raise ValueError(f"Unsupported format: {format_name}")
+
+
+class VoiceMetadata(TypedDict):
+    id: str
+    name: str
+    description: str
+    embedding: List[float]
+    is_public: bool
+    user_id: str
+    created_at: str
+    language: str
+
+
+class OutputFormat(TypedDict):
+    container: str
+    encoding: str
+    sample_rate: int
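
A standalone sketch of how the pieces of the new `cartesia/_types.py` fit together, using only names defined in the file above (the unsupported name in the error path is made up):

```python
from cartesia._types import OutputFormat, OutputFormatMapping

# Supported names resolve to plain dictionaries.
fmt = OutputFormatMapping.get_format("raw_pcm_s16le_16000")

# The same shape as the OutputFormat TypedDict that client.py passes around.
output_format: OutputFormat = {
    "container": fmt["container"],      # "raw"
    "encoding": fmt["encoding"],        # "pcm_s16le"
    "sample_rate": fmt["sample_rate"],  # 16000
}

# Unknown names raise ValueError.
try:
    OutputFormatMapping.get_format("raw_pcm_f32le_96000")
except ValueError as err:
    print(err)  # Unsupported format: raw_pcm_f32le_96000
```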
@@ -12,10 +12,12 @@ import logging
 import requests
 from websockets.sync.client import connect
 
-from cartesia.utils import retry_on_connection_error, retry_on_connection_error_async
+from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
+from cartesia.utils.deprecated import deprecated
 from cartesia._types import (
     OutputFormat,
     OutputFormatMapping,
+    DeprecatedOutputFormatMapping,
     VoiceMetadata,
 )
 
@@ -131,14 +133,7 @@ class Voices(Resource):
         """List all voices in your voice library.
 
         Returns:
-            This method returns a list of VoiceMetadata objects with the following keys:
-            - id: The ID of the voice.
-            - name: The name of the voice.
-            - description: The description of the voice.
-            - embedding: The embedding of the voice.
-            - is_public: Whether the voice is public.
-            - user_id: The ID of the user who created the voice.
-            - created_at: The timestamp (str) when the voice was created.
+            This method returns a list of VoiceMetadata objects.
         """
         response = httpx.get(
             f"{self._http_url()}/voices",
@@ -159,14 +154,7 @@ class Voices(Resource):
             id: The ID of the voice.
 
         Returns:
-            A dictionary containing the voice metadata with the following keys:
-            - id: The ID of the voice.
-            - name: The name of the voice.
-            - description: The description of the voice.
-            - embedding: The embedding of the voice as a list of floats.
-            - is_public: Whether the voice is public.
-            - user_id: The ID of the user who created the voice.
-            - created_at: The timestamp when the voice was created.
+            A VoiceMetadata object containing the voice metadata.
         """
         url = f"{self._http_url()}/voices/{id}"
         response = httpx.get(url, headers=self.headers, timeout=self.timeout)
@@ -344,8 +332,11 @@ class _WebSocket:
             stream: Whether to stream the audio or not. (Default is True)
 
         Returns:
-            If `stream` is True, the method returns a generator that yields chunks of audio as bytes.
-            If `stream` is False, the method returns a dictionary containing the concatenated audio as bytes and the context ID.
+            If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
+            If `stream` is False, the method returns a dictionary.
+            Both the generator and the dictionary contain the following key(s):
+            - audio: The audio as bytes.
+            - context_id: The context ID for the request.
         """
         self.connect()
 
@@ -490,8 +481,10 @@ class _SSE:
             stream: Whether to stream the audio or not.
 
         Returns:
-            If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary containing the audio as bytes.
-            If `stream` is False, the method returns a dictionary containing the audio as bytes.
+            If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
+            If `stream` is False, the method returns a dictionary.
+            Both the generator and the dictionary contain the following key(s):
+            - audio: The audio as bytes.
         """
         voice = self._validate_and_construct_voice(voice_id, voice_embedding)
 
@@ -581,15 +574,26 @@ class TTS(Resource):
         return ws
 
     def get_output_format(self, output_format_name: str) -> OutputFormat:
-        """Convenience method to get the output_format object from a given output format name.
+        """Convenience method to get the output_format dictionary from a given output format name.
 
         Args:
             output_format_name (str): The name of the output format.
 
         Returns:
             OutputFormat: A dictionary containing the details of the output format to be passed into tts.sse() or tts.websocket().send()
+
+        Raises:
+            ValueError: If the output_format name is not supported
         """
-        output_format_obj = OutputFormatMapping.get_format(output_format_name)
+        if output_format_name in OutputFormatMapping._format_mapping:
+            output_format_obj = OutputFormatMapping.get_format(output_format_name)
+        elif output_format_name in DeprecatedOutputFormatMapping._format_mapping:
+            output_format_obj = DeprecatedOutputFormatMapping.get_format_deprecated(
+                output_format_name
+            )
+        else:
+            raise ValueError(f"Unsupported format: {output_format_name}")
+
         return OutputFormat(
             container=output_format_obj["container"],
             encoding=output_format_obj["encoding"],
@@ -604,8 +608,19 @@ class TTS(Resource):
 
         Returns:
             int: The sample rate for the output format.
+
+        Raises:
+            ValueError: If the output_format name is not supported
         """
-        output_format_obj = OutputFormatMapping.get_format(output_format_name)
+        if output_format_name in OutputFormatMapping._format_mapping:
+            output_format_obj = OutputFormatMapping.get_format(output_format_name)
+        elif output_format_name in DeprecatedOutputFormatMapping._format_mapping:
+            output_format_obj = DeprecatedOutputFormatMapping.get_format_deprecated(
+                output_format_name
+            )
+        else:
+            raise ValueError(f"Unsupported format: {output_format_name}")
+
         return output_format_obj["sample_rate"]
 
 
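
The docstring changes above make the chunk shape explicit for both transports: SSE chunks carry an `audio` key, and websocket chunks additionally carry `context_id`. A minimal consumer sketch built from calls shown elsewhere in this diff (the voice choice and transcript are placeholders):

```python
import os

from cartesia import Cartesia

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
voice = client.voices.list()[0]  # any voice with an embedding
transcript = "Hello, world!"
output_format = client.tts.get_output_format("raw_pcm_f32le_44100")

# SSE: each chunk is a dictionary with the raw audio bytes under "audio".
for output in client.tts.sse(
    model_id="sonic-english",
    transcript=transcript,
    voice_embedding=voice["embedding"],
    stream=True,
    output_format=output_format,
):
    audio_bytes = output["audio"]

# Websocket: chunks also carry the context ID for the request.
ws = client.tts.websocket()
for output in ws.send(
    model_id="sonic-english",
    transcript=transcript,
    voice_embedding=voice["embedding"],
    stream=True,
    output_format=output_format,
):
    audio_bytes, context_id = output["audio"], output["context_id"]
ws.close()
```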
@@ -0,0 +1 @@
+__version__ = "1.0.1"
@@ -4,11 +4,11 @@ setup.py
 cartesia/__init__.py
 cartesia/_types.py
 cartesia/client.py
-cartesia/utils.py
 cartesia/version.py
 cartesia.egg-info/PKG-INFO
 cartesia.egg-info/SOURCES.txt
 cartesia.egg-info/dependency_links.txt
 cartesia.egg-info/requires.txt
 cartesia.egg-info/top_level.txt
+tests/test_deprecated.py
 tests/test_tts.py
@@ -0,0 +1,21 @@
+from packaging.version import Version
+
+import cartesia as Cartesia
+from cartesia.utils.deprecated import _DEPRECATED_FUNCTION_STATS
+import cartesia.version as version
+
+
+def test_deprecated_to_remove_by_version():
+    """
+    Test that all deprecated functions that are listed to be
+    removed by the current version are removed.
+    """
+    versions_to_remove = [x["vremove"] for x in _DEPRECATED_FUNCTION_STATS]
+    versions_to_remove = [Version(x) for x in versions_to_remove if x is not None]
+
+    curr_version = Version(version.__version__)
+
+    assert all(v > curr_version for v in versions_to_remove)
+
+# This test is taken from the following source:
+# https://github.com/ad12/meddlr/blob/main/tests/utils/test_deprecated.py
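
`cartesia/utils/deprecated.py` itself is not shown in this diff. The sketch below is a hypothetical reconstruction inferred only from the `@deprecated(vdeprecated=..., vremove=..., reason=...)` usage in `cartesia/_types.py` and from the `_DEPRECATED_FUNCTION_STATS` entries (with their `"vremove"` key) that the test above reads:

```python
# Hypothetical sketch of cartesia/utils/deprecated.py (not part of this diff).
import functools
import warnings

# Each entry records a deprecation schedule; the test above asserts that
# nothing outlives its "vremove" version.
_DEPRECATED_FUNCTION_STATS = []


def deprecated(vdeprecated=None, vremove=None, reason=None):
    def decorator(func):
        _DEPRECATED_FUNCTION_STATS.append(
            {"name": func.__qualname__, "vdeprecated": vdeprecated, "vremove": vremove}
        )

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            msg = f"{func.__qualname__} is deprecated since v{vdeprecated}"
            if vremove is not None:
                msg += f" and will be removed in v{vremove}"
            if reason is not None:
                msg += f". {reason}"
            warnings.warn(msg, DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        return wrapper

    return decorator
```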
@@ -1,37 +0,0 @@
-from typing import List, TypedDict
-
-class OutputFormatMapping:
-    _format_mapping = {
-        "fp32": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
-        "pcm": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
-        "fp32_16000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 16000},
-        "fp32_22050": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 22050},
-        "fp32_44100": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
-        "pcm_16000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
-        "pcm_22050": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 22050},
-        "pcm_44100": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
-        "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
-        "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
-    }
-
-    @classmethod
-    def get_format(cls, format_name):
-        if format_name in cls._format_mapping:
-            return cls._format_mapping[format_name]
-        else:
-            raise ValueError(f"Unsupported format: {format_name}")
-
-class VoiceMetadata(TypedDict):
-    id: str
-    name: str
-    description: str
-    embedding: List[float]
-    is_public: bool
-    user_id: str
-    created_at: str
-    language: str
-
-class OutputFormat(TypedDict):
-    container: str
-    encoding: str
-    sample_rate: int
@@ -1,87 +0,0 @@
-import time
-
-from aiohttp.client_exceptions import ServerDisconnectedError
-import asyncio
-from functools import wraps
-from http.client import RemoteDisconnected
-from httpx import TimeoutException
-from requests.exceptions import ConnectionError
-
-
-def retry_on_connection_error(max_retries=3, backoff_factor=1, logger=None):
-    """Retry a function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
-
-    Args:
-        max_retries (int): The maximum number of retries.
-        backoff_factor (int): The factor to increase the delay between retries.
-        logger (logging.Logger): The logger to use for logging.
-    """
-
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            retry_count = 0
-            while retry_count < max_retries:
-                try:
-                    return func(*args, **kwargs)
-                except (
-                    ConnectionError,
-                    RemoteDisconnected,
-                    ServerDisconnectedError,
-                    TimeoutException,
-                ) as e:
-                    logger.info(f"Retrying after exception: {e}")
-                    retry_count += 1
-                    if retry_count < max_retries:
-                        delay = backoff_factor * (2 ** (retry_count - 1))
-                        logger.warn(
-                            f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds..."
-                        )
-                        time.sleep(delay)
-                    else:
-                        raise Exception(f"Exception occurred after {max_retries} tries.") from e
-
-        return wrapper
-
-    return decorator
-
-
-def retry_on_connection_error_async(max_retries=3, backoff_factor=1, logger=None):
-    """Retry an asynchronous function if a ConnectionError, RemoteDisconnected, ServerDisconnectedError, or TimeoutException occurs.
-
-    Args:
-        max_retries (int): The maximum number of retries.
-        backoff_factor (int): The factor to increase the delay between retries.
-        logger (logging.Logger): The logger to use for logging.
-    """
-
-    def decorator(func):
-        @wraps(func)
-        async def wrapper(*args, **kwargs):
-            retry_count = 0
-            while retry_count < max_retries:
-                try:
-                    async for chunk in func(*args, **kwargs):
-                        yield chunk
-                    # If the function completes without raising an exception return
-                    return
-                except (
-                    ConnectionError,
-                    RemoteDisconnected,
-                    ServerDisconnectedError,
-                    TimeoutException,
-                ) as e:
-                    logger.info(f"Retrying after exception: {e}")
-                    retry_count += 1
-                    if retry_count < max_retries:
-                        delay = backoff_factor * (2 ** (retry_count - 1))
-                        logger.warn(
-                            f"Attempt {retry_count + 1}/{max_retries} in {delay} seconds..."
-                        )
-                        await asyncio.sleep(delay)
-                    else:
-                        raise Exception(f"Exception occurred after {max_retries} tries.") from e
-
-        return wrapper
-
-    return decorator
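
This module is deleted here because, per the import change in `cartesia/client.py` above, the retry helpers now live in `cartesia.utils.retry`. A usage sketch under that assumption (the decorated function is a made-up example; note the decorator logs on every retry, so a logger must be supplied or its internal `logger.info` call would fail):

```python
import logging

import requests

from cartesia.utils.retry import retry_on_connection_error

logger = logging.getLogger(__name__)


# Retries up to 3 attempts with exponential backoff (1 s, then 2 s) on the
# connection-related exceptions listed in the module above.
@retry_on_connection_error(max_retries=3, backoff_factor=1, logger=logger)
def fetch_status(url: str) -> int:
    return requests.get(url, timeout=5).status_code
```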
@@ -1 +0,0 @@
-__version__ = "1.0.0"