cartesia 1.0.4__tar.gz → 1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia-1.0.6/LICENSE.md +21 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/PKG-INFO +92 -13
- {cartesia-1.0.4 → cartesia-1.0.6}/README.md +90 -12
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia/_types.py +24 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia/client.py +284 -76
- cartesia-1.0.6/cartesia/version.py +1 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia.egg-info/PKG-INFO +92 -13
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia.egg-info/SOURCES.txt +1 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia.egg-info/requires.txt +1 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/tests/test_tts.py +137 -22
- cartesia-1.0.4/cartesia/version.py +0 -1
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia/__init__.py +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia/utils/retry.py +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/pyproject.toml +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/setup.cfg +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/setup.py +0 -0
- {cartesia-1.0.4 → cartesia-1.0.6}/tests/test_deprecated.py +0 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 Cartesia AI, Inc.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 1.0.4
|
3
|
+
Version: 1.0.6
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -12,6 +12,7 @@ Requires-Python: >=3.8.0
|
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
Provides-Extra: dev
|
14
14
|
Provides-Extra: all
|
15
|
+
License-File: LICENSE.md
|
15
16
|
|
16
17
|
|
17
18
|
# Cartesia Python API Library
|
@@ -80,10 +81,10 @@ voice = client.voices.get(id=voice_id)
|
|
80
81
|
|
81
82
|
transcript = "Hello! Welcome to Cartesia"
|
82
83
|
|
83
|
-
# You can check out our models at
|
84
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
84
85
|
model_id = "sonic-english"
|
85
86
|
|
86
|
-
# You can find the supported `output_format`s
|
87
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
87
88
|
output_format = {
|
88
89
|
"container": "raw",
|
89
90
|
"encoding": "pcm_f32le",
|
@@ -131,10 +132,10 @@ async def write_stream():
|
|
131
132
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
132
133
|
voice = client.voices.get(id=voice_id)
|
133
134
|
transcript = "Hello! Welcome to Cartesia"
|
134
|
-
# You can check out our models at
|
135
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
135
136
|
model_id = "sonic-english"
|
136
137
|
|
137
|
-
# You can find the supported `output_format`s
|
138
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
138
139
|
output_format = {
|
139
140
|
"container": "raw",
|
140
141
|
"encoding": "pcm_f32le",
|
@@ -186,10 +187,10 @@ voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
|
186
187
|
voice = client.voices.get(id=voice_id)
|
187
188
|
transcript = "Hello! Welcome to Cartesia"
|
188
189
|
|
189
|
-
# You can check out our models at
|
190
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
190
191
|
model_id = "sonic-english"
|
191
192
|
|
192
|
-
# You can find the supported `output_format`s
|
193
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
193
194
|
output_format = {
|
194
195
|
"container": "raw",
|
195
196
|
"encoding": "pcm_f32le",
|
@@ -233,7 +234,7 @@ In some cases, input text may need to be streamed in. In these cases, it would b
|
|
233
234
|
|
234
235
|
To mitigate this, Cartesia offers audio continuations. In this setting, users can send input text, as it becomes available, over a websocket connection.
|
235
236
|
|
236
|
-
To do this, we will create a `context` and
|
237
|
+
To do this, we will create a `context` and send multiple requests without awaiting the response. Then you can listen to the responses in the order they were sent.
|
237
238
|
|
238
239
|
Each `context` will be closed automatically after 5 seconds of inactivity or when the `no_more_inputs` method is called. `no_more_inputs` sends a request with `continue_=False`, which indicates that no more inputs will be sent over this context.
|
239
240
|
|
@@ -244,13 +245,13 @@ import pyaudio
|
|
244
245
|
from cartesia import AsyncCartesia
|
245
246
|
|
246
247
|
async def send_transcripts(ctx):
|
247
|
-
# Check out voice IDs by calling `client.voices.list()` or on
|
248
|
+
# Check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
|
248
249
|
voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
|
249
250
|
|
250
|
-
# You can check out our models at
|
251
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
251
252
|
model_id = "sonic-english"
|
252
253
|
|
253
|
-
# You can find the supported `output_format`s
|
254
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
254
255
|
output_format = {
|
255
256
|
"container": "raw",
|
256
257
|
"encoding": "pcm_f32le",
|
@@ -322,6 +323,84 @@ async def stream_and_listen():
|
|
322
323
|
asyncio.run(stream_and_listen())
|
323
324
|
```
|
324
325
|
|
326
|
+
You can also use continuations on the synchronous Cartesia client to stream in text as it becomes available. To do this, pass in a text generator that produces text chunks at intervals of less than 1 second, as shown below. This ensures smooth audio playback.
|
327
|
+
|
328
|
+
Note: the sync client has a different API for continuations compared to the async client.
|
329
|
+
|
330
|
+
```python
|
331
|
+
from cartesia import Cartesia
|
332
|
+
import pyaudio
|
333
|
+
import os
|
334
|
+
|
335
|
+
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
336
|
+
|
337
|
+
transcripts = [
|
338
|
+
"The crew engaged in a range of activities designed to mirror those "
|
339
|
+
"they might perform on a real Mars mission. ",
|
340
|
+
"Aside from growing vegetables and maintaining their habitat, they faced "
|
341
|
+
"additional stressors like communication delays with Earth, ",
|
342
|
+
"up to twenty-two minutes each way, to simulate the distance from Mars to our planet. ",
|
343
|
+
"These exercises were critical for understanding how astronauts can "
|
344
|
+
"maintain not just physical health but also mental well-being under such challenging conditions. ",
|
345
|
+
]
|
346
|
+
|
347
|
+
# Ending each transcript with a space makes the audio smoother
|
348
|
+
def chunk_generator(transcripts):
|
349
|
+
for transcript in transcripts:
|
350
|
+
if transcript.endswith(" "):
|
351
|
+
yield transcript
|
352
|
+
else:
|
353
|
+
yield transcript + " "
|
354
|
+
|
355
|
+
|
356
|
+
# You can check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
|
357
|
+
voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
|
358
|
+
|
359
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
360
|
+
model_id = "sonic-english"
|
361
|
+
|
362
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
363
|
+
output_format = {
|
364
|
+
"container": "raw",
|
365
|
+
"encoding": "pcm_f32le",
|
366
|
+
"sample_rate": 44100,
|
367
|
+
}
|
368
|
+
|
369
|
+
p = pyaudio.PyAudio()
|
370
|
+
rate = 44100
|
371
|
+
|
372
|
+
stream = None
|
373
|
+
|
374
|
+
# Set up the websocket connection
|
375
|
+
ws = client.tts.websocket()
|
376
|
+
|
377
|
+
# Create a context to send and receive audio
|
378
|
+
ctx = ws.context() # Generates a random context ID if not provided
|
379
|
+
|
380
|
+
# Pass in a text generator to generate & stream the audio
|
381
|
+
output_stream = ctx.send(
|
382
|
+
model_id=model_id,
|
383
|
+
transcript=chunk_generator(transcripts),
|
384
|
+
voice_id=voice_id,
|
385
|
+
output_format=output_format,
|
386
|
+
)
|
387
|
+
|
388
|
+
for output in output_stream:
|
389
|
+
buffer = output["audio"]
|
390
|
+
|
391
|
+
if not stream:
|
392
|
+
stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
|
393
|
+
|
394
|
+
# Write the audio data to the stream
|
395
|
+
stream.write(buffer)
|
396
|
+
|
397
|
+
stream.stop_stream()
|
398
|
+
stream.close()
|
399
|
+
p.terminate()
|
400
|
+
|
401
|
+
ws.close() # Close the websocket connection
|
402
|
+
```
|
403
|
+
|
325
404
|
### Multilingual Text-to-Speech [Alpha]
|
326
405
|
|
327
406
|
You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
|
@@ -339,10 +418,10 @@ voice = client.voices.get(id=voice_id)
|
|
339
418
|
transcript = "Hola! Bienvenido a Cartesia"
|
340
419
|
language = "es" # Language code corresponding to the language of the transcript
|
341
420
|
|
342
|
-
# Make sure you use the multilingual model! You can check out all models at
|
421
|
+
# Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
|
343
422
|
model_id = "sonic-multilingual"
|
344
423
|
|
345
|
-
# You can find the supported `output_format`s
|
424
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
346
425
|
output_format = {
|
347
426
|
"container": "raw",
|
348
427
|
"encoding": "pcm_f32le",
|
@@ -64,10 +64,10 @@ voice = client.voices.get(id=voice_id)
|
|
64
64
|
|
65
65
|
transcript = "Hello! Welcome to Cartesia"
|
66
66
|
|
67
|
-
# You can check out our models at
|
67
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
68
68
|
model_id = "sonic-english"
|
69
69
|
|
70
|
-
# You can find the supported `output_format`s
|
70
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
71
71
|
output_format = {
|
72
72
|
"container": "raw",
|
73
73
|
"encoding": "pcm_f32le",
|
@@ -115,10 +115,10 @@ async def write_stream():
|
|
115
115
|
voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
116
116
|
voice = client.voices.get(id=voice_id)
|
117
117
|
transcript = "Hello! Welcome to Cartesia"
|
118
|
-
# You can check out our models at
|
118
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
119
119
|
model_id = "sonic-english"
|
120
120
|
|
121
|
-
# You can find the supported `output_format`s
|
121
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
122
122
|
output_format = {
|
123
123
|
"container": "raw",
|
124
124
|
"encoding": "pcm_f32le",
|
@@ -170,10 +170,10 @@ voice_id = "a0e99841-438c-4a64-b679-ae501e7d6091"
|
|
170
170
|
voice = client.voices.get(id=voice_id)
|
171
171
|
transcript = "Hello! Welcome to Cartesia"
|
172
172
|
|
173
|
-
# You can check out our models at
|
173
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
174
174
|
model_id = "sonic-english"
|
175
175
|
|
176
|
-
# You can find the supported `output_format`s
|
176
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
177
177
|
output_format = {
|
178
178
|
"container": "raw",
|
179
179
|
"encoding": "pcm_f32le",
|
@@ -217,7 +217,7 @@ In some cases, input text may need to be streamed in. In these cases, it would b
|
|
217
217
|
|
218
218
|
To mitigate this, Cartesia offers audio continuations. In this setting, users can send input text, as it becomes available, over a websocket connection.
|
219
219
|
|
220
|
-
To do this, we will create a `context` and
|
220
|
+
To do this, we will create a `context` and send multiple requests without awaiting the response. Then you can listen to the responses in the order they were sent.
|
221
221
|
|
222
222
|
Each `context` will be closed automatically after 5 seconds of inactivity or when the `no_more_inputs` method is called. `no_more_inputs` sends a request with `continue_=False`, which indicates that no more inputs will be sent over this context.
|
223
223
|
|
@@ -228,13 +228,13 @@ import pyaudio
|
|
228
228
|
from cartesia import AsyncCartesia
|
229
229
|
|
230
230
|
async def send_transcripts(ctx):
|
231
|
-
# Check out voice IDs by calling `client.voices.list()` or on
|
231
|
+
# Check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
|
232
232
|
voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
|
233
233
|
|
234
|
-
# You can check out our models at
|
234
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
235
235
|
model_id = "sonic-english"
|
236
236
|
|
237
|
-
# You can find the supported `output_format`s
|
237
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
238
238
|
output_format = {
|
239
239
|
"container": "raw",
|
240
240
|
"encoding": "pcm_f32le",
|
@@ -306,6 +306,84 @@ async def stream_and_listen():
|
|
306
306
|
asyncio.run(stream_and_listen())
|
307
307
|
```
|
308
308
|
|
309
|
+
You can also use continuations on the synchronous Cartesia client to stream in text as it becomes available. To do this, pass in a text generator that produces text chunks at intervals of less than 1 second, as shown below. This ensures smooth audio playback.
|
310
|
+
|
311
|
+
Note: the sync client has a different API for continuations compared to the async client.
|
312
|
+
|
313
|
+
```python
|
314
|
+
from cartesia import Cartesia
|
315
|
+
import pyaudio
|
316
|
+
import os
|
317
|
+
|
318
|
+
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
|
319
|
+
|
320
|
+
transcripts = [
|
321
|
+
"The crew engaged in a range of activities designed to mirror those "
|
322
|
+
"they might perform on a real Mars mission. ",
|
323
|
+
"Aside from growing vegetables and maintaining their habitat, they faced "
|
324
|
+
"additional stressors like communication delays with Earth, ",
|
325
|
+
"up to twenty-two minutes each way, to simulate the distance from Mars to our planet. ",
|
326
|
+
"These exercises were critical for understanding how astronauts can "
|
327
|
+
"maintain not just physical health but also mental well-being under such challenging conditions. ",
|
328
|
+
]
|
329
|
+
|
330
|
+
# Ending each transcript with a space makes the audio smoother
|
331
|
+
def chunk_generator(transcripts):
|
332
|
+
for transcript in transcripts:
|
333
|
+
if transcript.endswith(" "):
|
334
|
+
yield transcript
|
335
|
+
else:
|
336
|
+
yield transcript + " "
|
337
|
+
|
338
|
+
|
339
|
+
# You can check out voice IDs by calling `client.voices.list()` or on https://play.cartesia.ai/
|
340
|
+
voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
|
341
|
+
|
342
|
+
# You can check out our models at https://docs.cartesia.ai/getting-started/available-models
|
343
|
+
model_id = "sonic-english"
|
344
|
+
|
345
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
346
|
+
output_format = {
|
347
|
+
"container": "raw",
|
348
|
+
"encoding": "pcm_f32le",
|
349
|
+
"sample_rate": 44100,
|
350
|
+
}
|
351
|
+
|
352
|
+
p = pyaudio.PyAudio()
|
353
|
+
rate = 44100
|
354
|
+
|
355
|
+
stream = None
|
356
|
+
|
357
|
+
# Set up the websocket connection
|
358
|
+
ws = client.tts.websocket()
|
359
|
+
|
360
|
+
# Create a context to send and receive audio
|
361
|
+
ctx = ws.context() # Generates a random context ID if not provided
|
362
|
+
|
363
|
+
# Pass in a text generator to generate & stream the audio
|
364
|
+
output_stream = ctx.send(
|
365
|
+
model_id=model_id,
|
366
|
+
transcript=chunk_generator(transcripts),
|
367
|
+
voice_id=voice_id,
|
368
|
+
output_format=output_format,
|
369
|
+
)
|
370
|
+
|
371
|
+
for output in output_stream:
|
372
|
+
buffer = output["audio"]
|
373
|
+
|
374
|
+
if not stream:
|
375
|
+
stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True)
|
376
|
+
|
377
|
+
# Write the audio data to the stream
|
378
|
+
stream.write(buffer)
|
379
|
+
|
380
|
+
stream.stop_stream()
|
381
|
+
stream.close()
|
382
|
+
p.terminate()
|
383
|
+
|
384
|
+
ws.close() # Close the websocket connection
|
385
|
+
```
|
386
|
+
|
309
387
|
### Multilingual Text-to-Speech [Alpha]
|
310
388
|
|
311
389
|
You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
|
@@ -323,10 +401,10 @@ voice = client.voices.get(id=voice_id)
|
|
323
401
|
transcript = "Hola! Bienvenido a Cartesia"
|
324
402
|
language = "es" # Language code corresponding to the language of the transcript
|
325
403
|
|
326
|
-
# Make sure you use the multilingual model! You can check out all models at
|
404
|
+
# Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
|
327
405
|
model_id = "sonic-multilingual"
|
328
406
|
|
329
|
-
# You can find the supported `output_format`s
|
407
|
+
# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
|
330
408
|
output_format = {
|
331
409
|
"container": "raw",
|
332
410
|
"encoding": "pcm_f32le",
|
@@ -70,7 +70,31 @@ class VoiceMetadata(TypedDict):
|
|
70
70
|
language: str
|
71
71
|
|
72
72
|
|
73
|
+
class VoiceControls(TypedDict):
|
74
|
+
"""Defines different voice control parameters for voice synthesis.
|
75
|
+
|
76
|
+
For a complete list of supported parameters, refer to the Cartesia API documentation.
|
77
|
+
https://docs.cartesia.ai/getting-started/welcome
|
78
|
+
|
79
|
+
Examples:
|
80
|
+
>>> {"speed": "fastest"}
|
81
|
+
>>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
|
82
|
+
>>> {"emotion": "surprise:high, positivity:high"}
|
83
|
+
|
84
|
+
Note:
|
85
|
+
This is an experimental class and is subject to rapid change in future versions.
|
86
|
+
"""
|
87
|
+
speed: str = ""
|
88
|
+
emotion: str = ""
|
89
|
+
|
90
|
+
|
73
91
|
class OutputFormat(TypedDict):
|
74
92
|
container: str
|
75
93
|
encoding: str
|
76
94
|
sample_rate: int
|
95
|
+
|
96
|
+
|
97
|
+
class EventType:
|
98
|
+
NULL = ""
|
99
|
+
AUDIO = "chunk"
|
100
|
+
TIMESTAMPS = "timestamps"
|