cartesia 1.0.13__tar.gz → 1.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-1.0.13 → cartesia-1.0.14}/PKG-INFO +10 -10
- {cartesia-1.0.13 → cartesia-1.0.14}/README.md +9 -9
- cartesia-1.0.14/cartesia/__init__.py +4 -0
- cartesia-1.0.14/cartesia/_async_sse.py +105 -0
- cartesia-1.0.14/cartesia/_async_websocket.py +323 -0
- cartesia-1.0.14/cartesia/_constants.py +10 -0
- cartesia-1.0.14/cartesia/_logger.py +3 -0
- cartesia-1.0.14/cartesia/_sse.py +152 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia/_types.py +3 -2
- cartesia-1.0.14/cartesia/_websocket.py +374 -0
- cartesia-1.0.14/cartesia/async_client.py +82 -0
- cartesia-1.0.14/cartesia/async_tts.py +22 -0
- cartesia-1.0.14/cartesia/client.py +69 -0
- cartesia-1.0.14/cartesia/resource.py +44 -0
- cartesia-1.0.14/cartesia/tts.py +109 -0
- cartesia-1.0.14/cartesia/utils/tts.py +25 -0
- cartesia-1.0.14/cartesia/version.py +1 -0
- cartesia-1.0.14/cartesia/voices.py +170 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia.egg-info/PKG-INFO +10 -10
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia.egg-info/SOURCES.txt +12 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia.egg-info/requires.txt +4 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/tests/test_tts.py +69 -8
- cartesia-1.0.13/cartesia/__init__.py +0 -3
- cartesia-1.0.13/cartesia/client.py +0 -1393
- cartesia-1.0.13/cartesia/version.py +0 -1
- {cartesia-1.0.13 → cartesia-1.0.14}/LICENSE.md +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia/utils/retry.py +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/pyproject.toml +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/setup.cfg +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/setup.py +0 -0
- {cartesia-1.0.13 → cartesia-1.0.14}/tests/test_deprecated.py +0 -0
{cartesia-1.0.13 → cartesia-1.0.14}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.13
+Version: 1.0.14
 Summary: The official Python library for the Cartesia API.
 Home-page: 
 Author: Cartesia, Inc.
@@ -18,12 +18,12 @@ License-File: LICENSE.md
 # Cartesia Python API Library
 
 
-[](https://discord.gg/
+[](https://discord.gg/cartesia)
 
 The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
 
 > [!IMPORTANT]
-> The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/
+> The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/cartesia) for any support requests!
 
 - [Cartesia Python API Library](#cartesia-python-api-library)
   - [Documentation](#documentation)
@@ -105,7 +105,7 @@ transcript = "Hello! Welcome to Cartesia"
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -156,7 +156,7 @@ async def write_stream():
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -211,7 +211,7 @@ transcript = "Hello! Welcome to Cartesia"
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -272,7 +272,7 @@ async def send_transcripts(ctx):
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -380,7 +380,7 @@ voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -470,7 +470,7 @@ language = "es" # Language code corresponding to the language of the transcript
 # Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-multilingual"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -623,7 +623,7 @@ display(audio)
 
 #### Output Formats
 
-You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/
+You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events).
 
 The previously used `output_format` strings are now deprecated and will be removed in v1.2.0. These are listed in the `DeprecatedOutputFormatMapping` class in `cartesia._types`.
 
{cartesia-1.0.13 → cartesia-1.0.14}/README.md

@@ -1,12 +1,12 @@
 # Cartesia Python API Library
 
 
-[](https://discord.gg/
+[](https://discord.gg/cartesia)
 
 The official Cartesia Python library which provides convenient access to the Cartesia REST and Websocket API from any Python 3.8+ application.
 
 > [!IMPORTANT]
-> The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/
+> The client library introduces breaking changes in v1.0.0, which was released on June 24th 2024. See the [release notes](https://github.com/cartesia-ai/cartesia-python/releases/tag/v1.0.0) and [migration guide](https://github.com/cartesia-ai/cartesia-python/discussions/44). Reach out to us on [Discord](https://discord.gg/cartesia) for any support requests!
 
 - [Cartesia Python API Library](#cartesia-python-api-library)
   - [Documentation](#documentation)
@@ -88,7 +88,7 @@ transcript = "Hello! Welcome to Cartesia"
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -139,7 +139,7 @@ async def write_stream():
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -194,7 +194,7 @@ transcript = "Hello! Welcome to Cartesia"
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -255,7 +255,7 @@ async def send_transcripts(ctx):
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -363,7 +363,7 @@ voice_id = "87748186-23bb-4158-a1eb-332911b0b708"
 # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-english"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -453,7 +453,7 @@ language = "es" # Language code corresponding to the language of the transcript
 # Make sure you use the multilingual model! You can check out all models at https://docs.cartesia.ai/getting-started/available-models
 model_id = "sonic-multilingual"
 
-# You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/
+# You can find the supported `output_format`s at https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events
 output_format = {
     "container": "raw",
     "encoding": "pcm_f32le",
@@ -606,7 +606,7 @@ display(audio)
 
 #### Output Formats
 
-You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/api-reference/
+You can use the `client.tts.get_output_format` method to convert string-based output format names into the `output_format` dictionary which is expected by the `output_format` parameter. You can see the `OutputFormatMapping` class in `cartesia._types` for the currently supported output format names. You can also view the currently supported `output_format`s in our [API Reference](https://docs.cartesia.ai/reference/api-reference/rest/stream-speech-server-sent-events).
 
 The previously used `output_format` strings are now deprecated and will be removed in v1.2.0. These are listed in the `DeprecatedOutputFormatMapping` class in `cartesia._types`.
 
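The only substantive README change above is the documentation URL for supported `output_format`s. For context, a minimal sketch of the `client.tts.get_output_format` helper the README describes, assuming the v1.x `Cartesia` client class and the `raw_pcm_f32le_44100` format name (the full set of names lives in `OutputFormatMapping` in `cartesia._types`):

```python
import os

from cartesia import Cartesia

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))

# Converts a string-based format name into the dict expected by the
# `output_format` parameter; equivalent to writing it out by hand:
# {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}
output_format = client.tts.get_output_format("raw_pcm_f32le_44100")
```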
cartesia-1.0.14/cartesia/_async_sse.py (new file)

@@ -0,0 +1,105 @@
+import base64
+import json
+from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
+
+import aiohttp
+
+from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
+from cartesia._logger import logger
+from cartesia._sse import _SSE
+from cartesia._types import OutputFormat, VoiceControls
+from cartesia.tts import TTS
+from cartesia.utils.retry import retry_on_connection_error_async
+
+
+class _AsyncSSE(_SSE):
+    """This class contains methods to generate audio using Server-Sent Events asynchronously."""
+
+    def __init__(
+        self,
+        http_url: str,
+        headers: Dict[str, str],
+        timeout: float,
+        get_session: Callable[[], Optional[aiohttp.ClientSession]],
+    ):
+        super().__init__(http_url, headers, timeout)
+        self._get_session = get_session
+
+    async def send(
+        self,
+        model_id: str,
+        transcript: str,
+        output_format: OutputFormat,
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        duration: Optional[int] = None,
+        language: Optional[str] = None,
+        stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
+
+        request_body = {
+            "model_id": model_id,
+            "transcript": transcript,
+            "voice": voice,
+            "output_format": {
+                "container": output_format["container"],
+                "encoding": output_format["encoding"],
+                "sample_rate": output_format["sample_rate"],
+            },
+            "language": language,
+        }
+
+        if duration is not None:
+            request_body["duration"] = duration
+
+        generator = self._sse_generator_wrapper(request_body)
+
+        if stream:
+            return generator
+
+        chunks = []
+        async for chunk in generator:
+            chunks.append(chunk["audio"])
+
+        return {"audio": b"".join(chunks)}
+
+    @retry_on_connection_error_async(
+        max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
+    )
+    async def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
+        """Need to wrap the sse generator in a function for the retry decorator to work."""
+        try:
+            async for chunk in self._sse_generator(request_body):
+                yield chunk
+        except Exception as e:
+            raise RuntimeError(f"Error generating audio. {e}")
+
+    async def _sse_generator(self, request_body: Dict[str, Any]):
+        session = await self._get_session()
+        async with session.post(
+            f"{self.http_url}/tts/sse",
+            data=json.dumps(request_body),
+            headers=self.headers,
+        ) as response:
+            if not response.ok:
+                raise ValueError(f"Failed to generate audio. {await response.text()}")
+
+            buffer = ""
+            async for chunk_bytes in response.content.iter_any():
+                buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
+                for output in outputs:
+                    yield output
+
+            if buffer:
+                try:
+                    chunk_json = json.loads(buffer)
+                    audio = base64.b64decode(chunk_json["data"])
+                    yield {"audio": audio}
+                except json.JSONDecodeError:
+                    pass
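`_AsyncSSE` is internal, and the diff does not show how the client exposes it. A usage sketch under the assumption that the async client wires it up as `client.tts.sse` (matching the README examples elsewhere in this diff; the voice ID below is the one those examples use):

```python
import asyncio
import os

from cartesia import AsyncCartesia


async def main():
    client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
    output_format = {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}

    # With stream=True the send() above returns an async generator of
    # {"audio": bytes} chunks parsed incrementally from the SSE stream.
    audio = bytearray()
    async for output in await client.tts.sse(
        model_id="sonic-english",
        transcript="Hello! Welcome to Cartesia",
        voice_id="87748186-23bb-4158-a1eb-332911b0b708",
        output_format=output_format,
        stream=True,
    ):
        audio.extend(output["audio"])

    await client.close()


asyncio.run(main())
```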
cartesia-1.0.14/cartesia/_async_websocket.py (new file)

@@ -0,0 +1,323 @@
+import asyncio
+import uuid
+from collections import defaultdict
+from types import TracebackType
+from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
+
+import aiohttp
+
+from cartesia._constants import DEFAULT_MODEL_ID, DEFAULT_VOICE_EMBEDDING
+from cartesia._types import OutputFormat, VoiceControls
+from cartesia._websocket import _WebSocket
+from cartesia.tts import TTS
+
+
+class _AsyncTTSContext:
+    """Manage a single context over an AsyncWebSocket.
+
+    This class separates sending requests and receiving responses into two separate methods.
+    This can be used for sending multiple requests without awaiting the response.
+    Then you can listen to the responses in the order they were sent. See README for usage.
+
+    Each AsyncTTSContext will close automatically when a done message is received for that context.
+    This happens when the no_more_inputs method is called (equivalent to sending a request with `continue_ = False`),
+    or if no requests have been sent for 5 seconds on the same context. It also closes if there is an error.
+
+    """
+
+    def __init__(self, context_id: str, websocket: "_AsyncWebSocket", timeout: float):
+        self._context_id = context_id
+        self._websocket = websocket
+        self.timeout = timeout
+        self._error = None
+
+    @property
+    def context_id(self) -> str:
+        return self._context_id
+
+    async def send(
+        self,
+        model_id: str,
+        transcript: str,
+        output_format: OutputFormat,
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        context_id: Optional[str] = None,
+        continue_: bool = False,
+        duration: Optional[int] = None,
+        language: Optional[str] = None,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> None:
+        """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
+
+        Args:
+            model_id: The ID of the model to use for generating audio.
+            transcript: The text to convert to speech.
+            output_format: A dictionary containing the details of the output format.
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
+            continue_: Whether to continue the audio generation from the previous transcript or not.
+            duration: The duration of the audio in seconds.
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.
+
+        Returns:
+            None.
+        """
+        if context_id is not None and context_id != self._context_id:
+            raise ValueError("Context ID does not match the context ID of the current context.")
+        if continue_ and transcript == "":
+            raise ValueError("Transcript cannot be empty when continue_ is True.")
+
+        await self._websocket.connect()
+
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
+
+        request_body = {
+            "model_id": model_id,
+            "transcript": transcript,
+            "voice": voice,
+            "output_format": {
+                "container": output_format["container"],
+                "encoding": output_format["encoding"],
+                "sample_rate": output_format["sample_rate"],
+            },
+            "context_id": self._context_id,
+            "continue": continue_,
+            "language": language,
+            "add_timestamps": add_timestamps,
+        }
+
+        if duration is not None:
+            request_body["duration"] = duration
+
+        await self._websocket.websocket.send_json(request_body)
+
+        # Start listening for responses on the WebSocket
+        self._websocket._dispatch_listener()
+
+    async def no_more_inputs(self) -> None:
+        """Send a request to the WebSocket to indicate that no more requests will be sent."""
+        await self.send(
+            model_id=DEFAULT_MODEL_ID,
+            transcript="",
+            output_format=TTS.get_output_format("raw_pcm_f32le_44100"),
+            voice_embedding=DEFAULT_VOICE_EMBEDDING,  # Default voice embedding since it's a required input for now.
+            context_id=self._context_id,
+            continue_=False,
+        )
+
+    async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:
+        """Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.
+
+        Returns:
+            An async generator that yields audio chunks. Each chunk is a dictionary containing the audio as bytes.
+        """
+        try:
+            while True:
+                response = await self._websocket._get_message(
+                    self._context_id, timeout=self.timeout
+                )
+                if "error" in response:
+                    raise RuntimeError(f"Error generating audio:\n{response['error']}")
+                if response["done"]:
+                    break
+                yield self._websocket._convert_response(response, include_context_id=True)
+        except Exception as e:
+            if isinstance(e, asyncio.TimeoutError):
+                raise RuntimeError("Timeout while waiting for audio chunk")
+            raise RuntimeError(f"Failed to generate audio:\n{e}")
+        finally:
+            self._close()
+
+    def _close(self) -> None:
+        """Closes the context. Automatically called when a done message is received for this context."""
+        self._websocket._remove_context(self._context_id)
+
+    def is_closed(self):
+        """Check if the context is closed or not. Returns True if closed."""
+        return self._context_id not in self._websocket._context_queues
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: Union[type, None],
+        exc: Union[BaseException, None],
+        exc_tb: Union[TracebackType, None],
+    ):
+        self._close()
+
+    def __del__(self):
+        self._close()
+
+
+class _AsyncWebSocket(_WebSocket):
+    """This class contains methods to generate audio using WebSocket asynchronously."""
+
+    def __init__(
+        self,
+        ws_url: str,
+        api_key: str,
+        cartesia_version: str,
+        timeout: float,
+        get_session: Callable[[], Optional[aiohttp.ClientSession]],
+    ):
+        """
+        Args:
+            ws_url: The WebSocket URL for the Cartesia API.
+            api_key: The API key to use for authorization.
+            cartesia_version: The version of the Cartesia API to use.
+            timeout: The timeout for responses on the WebSocket in seconds.
+            get_session: A function that returns an aiohttp.ClientSession object.
+        """
+        super().__init__(ws_url, api_key, cartesia_version)
+        self.timeout = timeout
+        self._get_session = get_session
+        self.websocket = None
+        self._context_queues: Dict[str, asyncio.Queue] = {}
+        self._processing_task: asyncio.Task = None
+
+    def __del__(self):
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+        if loop is None:
+            asyncio.run(self.close())
+        elif loop.is_running():
+            loop.create_task(self.close())
+
+    async def connect(self):
+        if self.websocket is None or self._is_websocket_closed():
+            route = "tts/websocket"
+            session = await self._get_session()
+            try:
+                self.websocket = await session.ws_connect(
+                    f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
+                )
+            except Exception as e:
+                raise RuntimeError(f"Failed to connect to WebSocket. {e}")
+
+    def _is_websocket_closed(self):
+        return self.websocket.closed
+
+    async def close(self):
+        """This method closes the websocket connection. *Highly* recommended to call this method when done."""
+        if self.websocket is not None and not self._is_websocket_closed():
+            await self.websocket.close()
+        if self._processing_task:
+            self._processing_task.cancel()
+            try:
+                self._processing_task = None
+            except asyncio.CancelledError:
+                pass
+            except TypeError as e:
+                # Ignore the error if the task is already cancelled
+                # For some reason we are getting None responses
+                # TODO: This needs to be fixed - we need to think about why we are getting None responses.
+                if "Received message 256:None" not in str(e):
+                    raise e
+
+        for context_id in list(self._context_queues.keys()):
+            self._remove_context(context_id)
+
+        self._context_queues.clear()
+        self._processing_task = None
+        self.websocket = None
+
+    async def send(
+        self,
+        model_id: str,
+        transcript: str,
+        output_format: OutputFormat,
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        context_id: Optional[str] = None,
+        duration: Optional[int] = None,
+        language: Optional[str] = None,
+        stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        """See :meth:`_WebSocket.send` for details."""
+        if context_id is None:
+            context_id = str(uuid.uuid4())
+
+        ctx = self.context(context_id)
+
+        await ctx.send(
+            model_id=model_id,
+            transcript=transcript,
+            output_format=output_format,
+            voice_id=voice_id,
+            voice_embedding=voice_embedding,
+            context_id=context_id,
+            duration=duration,
+            language=language,
+            continue_=False,
+            add_timestamps=add_timestamps,
+            _experimental_voice_controls=_experimental_voice_controls,
+        )
+
+        generator = ctx.receive()
+
+        if stream:
+            return generator
+
+        chunks = []
+        word_timestamps = defaultdict(list)
+        async for chunk in generator:
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out
+
+    async def _process_responses(self):
+        try:
+            while True:
+                response = await self.websocket.receive_json()
+                if response["context_id"]:
+                    context_id = response["context_id"]
+                if context_id in self._context_queues:
+                    await self._context_queues[context_id].put(response)
+        except Exception as e:
+            self._error = e
+            raise e
+
+    async def _get_message(self, context_id: str, timeout: float) -> Dict[str, Any]:
+        if context_id not in self._context_queues:
+            raise ValueError(f"Context ID {context_id} not found.")
+        return await asyncio.wait_for(self._context_queues[context_id].get(), timeout=timeout)
+
+    def _remove_context(self, context_id: str):
+        if context_id in self._context_queues:
+            del self._context_queues[context_id]
+
+    def _dispatch_listener(self):
+        if self._processing_task is None or self._processing_task.done():
+            self._processing_task = asyncio.create_task(self._process_responses())
+
+    def context(self, context_id: Optional[str] = None) -> _AsyncTTSContext:
+        if context_id in self._context_queues:
+            raise ValueError(f"AsyncContext for context ID {context_id} already exists.")
+        if context_id is None:
+            context_id = str(uuid.uuid4())
+        if context_id not in self._context_queues:
+            self._context_queues[context_id] = asyncio.Queue()
+        return _AsyncTTSContext(context_id, self, self.timeout)
cartesia-1.0.14/cartesia/_constants.py (new file)

@@ -0,0 +1,10 @@
+DEFAULT_MODEL_ID = "sonic-english"  # latest default model
+MULTILINGUAL_MODEL_ID = "sonic-multilingual"  # latest multilingual model
+DEFAULT_BASE_URL = "api.cartesia.ai"
+DEFAULT_CARTESIA_VERSION = "2024-06-10"  # latest version
+DEFAULT_TIMEOUT = 30  # seconds
+DEFAULT_NUM_CONNECTIONS = 10  # connections per client
+DEFAULT_VOICE_EMBEDDING = [1.0] * 192
+
+BACKOFF_FACTOR = 1
+MAX_RETRIES = 3