cartesia 0.1.1__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Metadata-Version: 2.1
Name: cartesia
Version: 1.0.1
Summary: The official Python library for the Cartesia API.
Home-page:
Author: Cartesia, Inc.
Author-email: support@cartesia.ai
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.8.0
Description-Content-Type: text/markdown
Provides-Extra: dev
Provides-Extra: all

# Cartesia Python API Library

![PyPI - Version](https://img.shields.io/pypi/v/cartesia)
[![Discord](https://badgen.net/badge/black/Cartesia/icon?icon=discord&label)](https://discord.gg/ZVxavqHB9X)

The official Cartesia Python library, which provides convenient access to the Cartesia REST and WebSocket APIs from any Python 3.8+ application.

> [!IMPORTANT]
> The client library introduces breaking changes in v1.0.0, released on June 24, 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/ZVxavqHB9X) for any support requests!

## Documentation

Our complete API documentation can be found [on docs.cartesia.ai](https://docs.cartesia.ai).

## Installation

```bash
pip install cartesia

# Or, to install from a clone of the repo in editable mode with dev dependencies:
pip install -e '.[dev]'
```

## Voices

```python
from cartesia import Cartesia
import os

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))

# Get all available voices
voices = client.voices.list()
print(voices)

# Get a specific voice
voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
print("The embedding for", voice["name"], "is", voice["embedding"])

# Clone a voice using filepath
cloned_voice_embedding = client.voices.clone(filepath="path/to/voice")

# Create a new voice
new_voice = client.voices.create(
    name="New Voice",
    description="A clone of my own voice",
    embedding=cloned_voice_embedding,
)
```
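
Since the embedding prints as a list of floats, one option is to cache it locally and skip re-cloning on subsequent runs. A minimal sketch, assuming the embedding is JSON-serializable (the `voice_embedding.json` filename is our example, not a library convention):

```python
import json

# Save the cloned embedding for later reuse
with open("voice_embedding.json", "w") as f:
    json.dump(cloned_voice_embedding, f)

# Later: load the cached embedding instead of cloning again
with open("voice_embedding.json") as f:
    cached_embedding = json.load(f)
```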

## Text-to-Speech

### Server-Sent Events (SSE)

```python
from cartesia import Cartesia
import pyaudio
import os

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
voice_name = "Barbershop Man"
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
voice = client.voices.get(id=voice_id)

transcript = "Hello! Welcome to Cartesia"

# You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
model_id = "sonic-english"

# You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
output_format = {
    "container": "raw",
    "encoding": "pcm_f32le",
    "sample_rate": 44100,
}

p = pyaudio.PyAudio()
rate = 44100

stream = None

# Generate and stream audio
for output in client.tts.sse(
    model_id=model_id,
    transcript=transcript,
    voice_embedding=voice["embedding"],
    stream=True,
    output_format=output_format,
):
    buffer = output["audio"]

    if not stream:
        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)

    # Write the audio data to the stream
    stream.write(buffer)

stream.stop_stream()
stream.close()
p.terminate()
```

You can also use the async client if you want to make asynchronous API calls. Simply import `AsyncCartesia` instead of `Cartesia` and use `await` with each API call:

```python
from cartesia import AsyncCartesia
import asyncio
import pyaudio
import os


async def write_stream():
    client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
    voice_name = "Barbershop Man"
    voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
    voice = client.voices.get(id=voice_id)
    transcript = "Hello! Welcome to Cartesia"
    # You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
    model_id = "sonic-english"

    # You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
    output_format = {
        "container": "raw",
        "encoding": "pcm_f32le",
        "sample_rate": 44100,
    }

    p = pyaudio.PyAudio()
    rate = 44100

    stream = None

    # Generate and stream audio
    async for output in await client.tts.sse(
        model_id=model_id,
        transcript=transcript,
        voice_embedding=voice["embedding"],
        stream=True,
        output_format=output_format,
    ):
        buffer = output["audio"]

        if not stream:
            stream = p.open(
                format=pyaudio.paFloat32, channels=1, rate=rate, output=True
            )

        # Write the audio data to the stream
        stream.write(buffer)

    stream.stop_stream()
    stream.close()
    p.terminate()
    await client.close()


asyncio.run(write_stream())
```
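
Because the calls are asynchronous, you can also fan several generations out at once. Below is a rough sketch using the same API calls as above, assuming the async client can serve overlapping requests; the `synth` helper is ours, not part of the library:

```python
from cartesia import AsyncCartesia
import asyncio
import os


async def synth(client, transcript, voice_embedding, output_format):
    # Collect the streamed chunks for one transcript into a single bytes object
    chunks = []
    async for output in await client.tts.sse(
        model_id="sonic-english",
        transcript=transcript,
        voice_embedding=voice_embedding,
        stream=True,
        output_format=output_format,
    ):
        chunks.append(output["audio"])
    return b"".join(chunks)


async def main():
    client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
    voice = client.voices.get(id="a0e99841-438c-4a64-b679-ae501e7d6091")
    output_format = {
        "container": "raw",
        "encoding": "pcm_f32le",
        "sample_rate": 44100,
    }

    # Run two generations concurrently and wait for both to finish
    clips = await asyncio.gather(
        synth(client, "First sentence.", voice["embedding"], output_format),
        synth(client, "Second sentence.", voice["embedding"], output_format),
    )
    await client.close()
    return clips


asyncio.run(main())
```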

### WebSocket

```python
from cartesia import Cartesia
import pyaudio
import os

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
voice_name = "Barbershop Man"
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
voice = client.voices.get(id=voice_id)
transcript = "Hello! Welcome to Cartesia"

# You can check out our models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
model_id = "sonic-english"

# You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
output_format = {
    "container": "raw",
    "encoding": "pcm_f32le",
    "sample_rate": 22050,
}

p = pyaudio.PyAudio()
rate = 22050

stream = None

# Set up the websocket connection
ws = client.tts.websocket()

# Generate and stream audio using the websocket
for output in ws.send(
    model_id=model_id,
    transcript=transcript,
    voice_embedding=voice["embedding"],
    stream=True,
    output_format=output_format,
):
    buffer = output["audio"]

    if not stream:
        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)

    # Write the audio data to the stream
    stream.write(buffer)

stream.stop_stream()
stream.close()
p.terminate()

ws.close()  # Close the websocket connection
```
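
If you would rather end up with a file than live playback, you can collect each `output["audio"]` chunk into a list (say, `chunks`) instead of writing it to PyAudio, then convert the raw `pcm_f32le` bytes and write a WAV with the standard library. A minimal sketch, not an official API; the float-to-int16 conversion and the `output.wav` filename are our choices:

```python
import wave

import numpy as np

# `chunks` holds the raw pcm_f32le byte chunks collected from the loop above
audio_bytes = b"".join(chunks)
samples = np.frombuffer(audio_bytes, dtype=np.float32)

# Convert float32 samples in [-1.0, 1.0] to 16-bit PCM, which the wave module expects
pcm16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

with wave.open("output.wav", "wb") as f:
    f.setnchannels(1)      # mono
    f.setsampwidth(2)      # 16-bit samples
    f.setframerate(22050)  # match the request's sample_rate
    f.writeframes(pcm16.tobytes())
```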

### Multilingual Text-to-Speech [Alpha]

You can use our `sonic-multilingual` model to generate audio in multiple languages. The supported languages are listed at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).

```python
from cartesia import Cartesia
import pyaudio
import os

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
voice_name = "Barbershop Man"
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
voice = client.voices.get(id=voice_id)

transcript = "Hola! Bienvenido a Cartesia"
language = "es"  # Language code corresponding to the language of the transcript

# Make sure you use the multilingual model! You can check out all models at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
model_id = "sonic-multilingual"

# You can find the supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).
output_format = {
    "container": "raw",
    "encoding": "pcm_f32le",
    "sample_rate": 44100,
}

p = pyaudio.PyAudio()
rate = 44100

stream = None

# Pass the corresponding language code to the `language` parameter to generate and stream audio.
for output in client.tts.sse(
    model_id=model_id,
    transcript=transcript,
    voice_embedding=voice["embedding"],
    stream=True,
    output_format=output_format,
    language=language,
):
    buffer = output["audio"]

    if not stream:
        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)

    stream.write(buffer)

stream.stop_stream()
stream.close()
p.terminate()
```

If you are using Jupyter Notebook or JupyterLab, you can use `IPython.display.Audio` to play the generated audio directly in the notebook.
Additionally, these notebook examples show how to use the client as a context manager (though this is not required).

```python
from IPython.display import Audio, display
import io
import os
import numpy as np

from cartesia import Cartesia

with Cartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
    output_format = {
        "container": "raw",
        "encoding": "pcm_f32le",
        "sample_rate": 8000,
    }
    rate = 8000
    voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
    voice = client.voices.get(id=voice_id)
    transcript = "Hey there! Welcome to Cartesia"

    # Create a BytesIO object to store the audio data
    audio_data = io.BytesIO()

    # Generate and stream audio
    for output in client.tts.sse(
        model_id="sonic-english",
        transcript=transcript,
        voice_embedding=voice["embedding"],
        stream=True,
        output_format=output_format,
    ):
        buffer = output["audio"]
        audio_data.write(buffer)

    # Set the cursor position to the beginning of the BytesIO object
    audio_data.seek(0)

    # Create an Audio object from the BytesIO data
    audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=rate)

    # Display the Audio object
    display(audio)
```

Below is the same example using the async client:

```python
from IPython.display import Audio, display
import io
import os
import numpy as np

from cartesia import AsyncCartesia

async with AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY")) as client:
    output_format = {
        "container": "raw",
        "encoding": "pcm_f32le",
        "sample_rate": 8000,
    }
    rate = 8000
    voice_id = "248be419-c632-4f23-adf1-5324ed7dbf1d"
    transcript = "Hey there! Welcome to Cartesia"

    # Create a BytesIO object to store the audio data
    audio_data = io.BytesIO()

    # Generate and stream audio
    async for output in client.tts.sse(
        model_id="sonic-english",
        transcript=transcript,
        voice_id=voice_id,
        stream=True,
        output_format=output_format,
    ):
        buffer = output["audio"]
        audio_data.write(buffer)

    # Set the cursor position to the beginning of the BytesIO object
    audio_data.seek(0)

    # Create an Audio object from the BytesIO data
    audio = Audio(np.frombuffer(audio_data.read(), dtype=np.float32), rate=rate)

    # Display the Audio object
    display(audio)
```

### Utility methods

#### Output Formats

You can use the `client.tts.get_output_format` method to convert a string-based output format name into the dictionary expected by the `output_format` parameter. See the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names; they are also listed in our [API Reference](https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events).

The previously used `output_format` strings are now deprecated and will be removed in v1.2.0. These are listed in the `DeprecatedOutputFormatMapping` class in `cartesia._types`.

```python
# Get the output format dictionary from its string name
output_format = client.tts.get_output_format("raw_pcm_f32le_44100")

# Pass the output format dictionary in to generate and stream audio
generator = client.tts.sse(
    model_id=model_id,
    transcript=transcript,
    voice_id=voice_id,
    stream=True,
    output_format=output_format,
)
```

To avoid storing your API key in the source code, we recommend doing one of the following:

1. Use [`python-dotenv`](https://pypi.org/project/python-dotenv/) to add `CARTESIA_API_KEY="my-api-key"` to your `.env` file, as shown in the sketch below.
2. Set the `CARTESIA_API_KEY` environment variable, preferably in a secure shell init file (e.g. `~/.zshrc`, `~/.bashrc`).
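
For the first option, here is a minimal sketch, assuming a `.env` file next to your script that contains `CARTESIA_API_KEY="my-api-key"`:

```python
import os

from dotenv import load_dotenv

from cartesia import Cartesia

# Load variables from .env into the process environment
load_dotenv()

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
```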