cartesia 1.4.0__py3-none-any.whl → 2.0.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +288 -3
- cartesia/api_status/__init__.py +6 -0
- cartesia/api_status/client.py +104 -0
- cartesia/api_status/requests/__init__.py +5 -0
- cartesia/api_status/requests/api_info.py +8 -0
- cartesia/api_status/types/__init__.py +5 -0
- cartesia/api_status/types/api_info.py +20 -0
- cartesia/base_client.py +160 -0
- cartesia/client.py +163 -40
- cartesia/core/__init__.py +47 -0
- cartesia/core/api_error.py +15 -0
- cartesia/core/client_wrapper.py +55 -0
- cartesia/core/datetime_utils.py +28 -0
- cartesia/core/file.py +67 -0
- cartesia/core/http_client.py +499 -0
- cartesia/core/jsonable_encoder.py +101 -0
- cartesia/core/pydantic_utilities.py +296 -0
- cartesia/core/query_encoder.py +58 -0
- cartesia/core/remove_none_from_dict.py +11 -0
- cartesia/core/request_options.py +35 -0
- cartesia/core/serialization.py +272 -0
- cartesia/datasets/__init__.py +24 -0
- cartesia/datasets/client.py +422 -0
- cartesia/datasets/requests/__init__.py +15 -0
- cartesia/datasets/requests/create_dataset_request.py +7 -0
- cartesia/datasets/requests/dataset.py +9 -0
- cartesia/datasets/requests/dataset_file.py +9 -0
- cartesia/datasets/requests/paginated_dataset_files.py +10 -0
- cartesia/datasets/requests/paginated_datasets.py +10 -0
- cartesia/datasets/types/__init__.py +17 -0
- cartesia/datasets/types/create_dataset_request.py +19 -0
- cartesia/datasets/types/dataset.py +21 -0
- cartesia/datasets/types/dataset_file.py +21 -0
- cartesia/datasets/types/file_purpose.py +5 -0
- cartesia/datasets/types/paginated_dataset_files.py +21 -0
- cartesia/datasets/types/paginated_datasets.py +21 -0
- cartesia/embedding/__init__.py +5 -0
- cartesia/embedding/types/__init__.py +5 -0
- cartesia/embedding/types/embedding.py +201 -0
- cartesia/environment.py +7 -0
- cartesia/infill/__init__.py +2 -0
- cartesia/infill/client.py +294 -0
- cartesia/tts/__init__.py +167 -0
- cartesia/{_async_websocket.py → tts/_async_websocket.py} +159 -84
- cartesia/tts/_websocket.py +430 -0
- cartesia/tts/client.py +407 -0
- cartesia/tts/requests/__init__.py +76 -0
- cartesia/tts/requests/cancel_context_request.py +17 -0
- cartesia/tts/requests/controls.py +11 -0
- cartesia/tts/requests/generation_request.py +53 -0
- cartesia/tts/requests/mp_3_output_format.py +11 -0
- cartesia/tts/requests/output_format.py +30 -0
- cartesia/tts/requests/phoneme_timestamps.py +10 -0
- cartesia/tts/requests/raw_output_format.py +11 -0
- cartesia/tts/requests/speed.py +7 -0
- cartesia/tts/requests/tts_request.py +24 -0
- cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
- cartesia/tts/requests/tts_request_id_specifier.py +16 -0
- cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
- cartesia/tts/requests/wav_output_format.py +7 -0
- cartesia/tts/requests/web_socket_base_response.py +11 -0
- cartesia/tts/requests/web_socket_chunk_response.py +8 -0
- cartesia/tts/requests/web_socket_done_response.py +7 -0
- cartesia/tts/requests/web_socket_error_response.py +7 -0
- cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
- cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
- cartesia/tts/requests/web_socket_request.py +7 -0
- cartesia/tts/requests/web_socket_response.py +69 -0
- cartesia/tts/requests/web_socket_stream_options.py +8 -0
- cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_tts_output.py +18 -0
- cartesia/tts/requests/web_socket_tts_request.py +24 -0
- cartesia/tts/requests/word_timestamps.py +10 -0
- cartesia/tts/socket_client.py +302 -0
- cartesia/tts/types/__init__.py +90 -0
- cartesia/tts/types/cancel_context_request.py +28 -0
- cartesia/tts/types/context_id.py +3 -0
- cartesia/tts/types/controls.py +22 -0
- cartesia/tts/types/emotion.py +29 -0
- cartesia/tts/types/flush_id.py +3 -0
- cartesia/tts/types/generation_request.py +66 -0
- cartesia/tts/types/mp_3_output_format.py +23 -0
- cartesia/tts/types/natural_specifier.py +5 -0
- cartesia/tts/types/numerical_specifier.py +3 -0
- cartesia/tts/types/output_format.py +58 -0
- cartesia/tts/types/phoneme_timestamps.py +21 -0
- cartesia/tts/types/raw_encoding.py +5 -0
- cartesia/tts/types/raw_output_format.py +22 -0
- cartesia/tts/types/speed.py +7 -0
- cartesia/tts/types/supported_language.py +7 -0
- cartesia/tts/types/tts_request.py +35 -0
- cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
- cartesia/tts/types/tts_request_id_specifier.py +27 -0
- cartesia/tts/types/tts_request_voice_specifier.py +7 -0
- cartesia/tts/types/wav_output_format.py +17 -0
- cartesia/tts/types/web_socket_base_response.py +22 -0
- cartesia/tts/types/web_socket_chunk_response.py +20 -0
- cartesia/tts/types/web_socket_done_response.py +17 -0
- cartesia/tts/types/web_socket_error_response.py +19 -0
- cartesia/tts/types/web_socket_flush_done_response.py +21 -0
- cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_raw_output_format.py +22 -0
- cartesia/tts/types/web_socket_request.py +7 -0
- cartesia/tts/types/web_socket_response.py +124 -0
- cartesia/tts/types/web_socket_stream_options.py +19 -0
- cartesia/tts/types/web_socket_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_tts_output.py +27 -0
- cartesia/tts/types/web_socket_tts_request.py +36 -0
- cartesia/tts/types/word_timestamps.py +21 -0
- cartesia/tts/utils/tts.py +64 -0
- cartesia/tts/utils/types.py +70 -0
- cartesia/version.py +3 -1
- cartesia/voice_changer/__init__.py +27 -0
- cartesia/voice_changer/client.py +395 -0
- cartesia/voice_changer/requests/__init__.py +15 -0
- cartesia/voice_changer/requests/streaming_response.py +36 -0
- cartesia/voice_changer/types/__init__.py +17 -0
- cartesia/voice_changer/types/output_format_container.py +5 -0
- cartesia/voice_changer/types/streaming_response.py +62 -0
- cartesia/voices/__init__.py +67 -0
- cartesia/voices/client.py +1812 -0
- cartesia/voices/requests/__init__.py +27 -0
- cartesia/voices/requests/create_voice_request.py +21 -0
- cartesia/voices/requests/embedding_response.py +8 -0
- cartesia/voices/requests/embedding_specifier.py +10 -0
- cartesia/voices/requests/id_specifier.py +10 -0
- cartesia/voices/requests/localize_dialect.py +6 -0
- cartesia/voices/requests/localize_voice_request.py +15 -0
- cartesia/voices/requests/mix_voice_specifier.py +7 -0
- cartesia/voices/requests/mix_voices_request.py +9 -0
- cartesia/voices/requests/update_voice_request.py +15 -0
- cartesia/voices/requests/voice.py +39 -0
- cartesia/voices/requests/voice_metadata.py +36 -0
- cartesia/voices/types/__init__.py +41 -0
- cartesia/voices/types/base_voice_id.py +5 -0
- cartesia/voices/types/clone_mode.py +5 -0
- cartesia/voices/types/create_voice_request.py +32 -0
- cartesia/voices/types/embedding_response.py +20 -0
- cartesia/voices/types/embedding_specifier.py +22 -0
- cartesia/voices/types/gender.py +5 -0
- cartesia/voices/types/id_specifier.py +22 -0
- cartesia/voices/types/localize_dialect.py +6 -0
- cartesia/voices/types/localize_english_dialect.py +5 -0
- cartesia/voices/types/localize_target_language.py +7 -0
- cartesia/voices/types/localize_voice_request.py +26 -0
- cartesia/voices/types/mix_voice_specifier.py +7 -0
- cartesia/voices/types/mix_voices_request.py +20 -0
- cartesia/voices/types/update_voice_request.py +27 -0
- cartesia/voices/types/voice.py +50 -0
- cartesia/voices/types/voice_id.py +3 -0
- cartesia/voices/types/voice_metadata.py +48 -0
- cartesia/voices/types/weight.py +3 -0
- cartesia-2.0.0a0.dist-info/METADATA +306 -0
- cartesia-2.0.0a0.dist-info/RECORD +158 -0
- {cartesia-1.4.0.dist-info → cartesia-2.0.0a0.dist-info}/WHEEL +1 -1
- cartesia/_async_sse.py +0 -95
- cartesia/_logger.py +0 -3
- cartesia/_sse.py +0 -143
- cartesia/_types.py +0 -70
- cartesia/_websocket.py +0 -358
- cartesia/async_client.py +0 -82
- cartesia/async_tts.py +0 -176
- cartesia/resource.py +0 -44
- cartesia/tts.py +0 -292
- cartesia/utils/deprecated.py +0 -55
- cartesia/utils/retry.py +0 -87
- cartesia/utils/tts.py +0 -78
- cartesia/voices.py +0 -204
- cartesia-1.4.0.dist-info/METADATA +0 -663
- cartesia-1.4.0.dist-info/RECORD +0 -23
- cartesia-1.4.0.dist-info/licenses/LICENSE.md +0 -21
- /cartesia/{utils/__init__.py → py.typed} +0 -0
- /cartesia/{_constants.py → tts/utils/constants.py} +0 -0
cartesia/_sse.py
DELETED
@@ -1,143 +0,0 @@
|
|
1
|
-
import base64
|
2
|
-
import json
|
3
|
-
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
|
4
|
-
|
5
|
-
import requests
|
6
|
-
|
7
|
-
from cartesia._constants import BACKOFF_FACTOR, MAX_RETRIES
|
8
|
-
from cartesia._logger import logger
|
9
|
-
from cartesia._types import OutputFormat, VoiceControls
|
10
|
-
from cartesia.utils.retry import retry_on_connection_error
|
11
|
-
from cartesia.utils.tts import _construct_tts_request, _validate_and_construct_voice
|
12
|
-
|
13
|
-
|
14
|
-
class _SSE:
|
15
|
-
"""This class contains methods to generate audio using Server-Sent Events.
|
16
|
-
|
17
|
-
Usage:
|
18
|
-
>>> for audio_chunk in client.tts.sse(
|
19
|
-
... model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
|
20
|
-
... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}, stream=True
|
21
|
-
... ):
|
22
|
-
... audio = audio_chunk["audio"]
|
23
|
-
"""
|
24
|
-
|
25
|
-
def __init__(
|
26
|
-
self,
|
27
|
-
http_url: str,
|
28
|
-
headers: Dict[str, str],
|
29
|
-
timeout: float,
|
30
|
-
):
|
31
|
-
self.http_url = http_url
|
32
|
-
self.headers = headers
|
33
|
-
self.timeout = timeout
|
34
|
-
|
35
|
-
def _update_buffer(self, buffer: str, chunk_bytes: bytes) -> Tuple[str, List[Dict[str, Any]]]:
|
36
|
-
buffer += chunk_bytes.decode("utf-8")
|
37
|
-
outputs = []
|
38
|
-
while "{" in buffer and "}" in buffer:
|
39
|
-
start_index = buffer.find("{")
|
40
|
-
end_index = buffer.find("}", start_index)
|
41
|
-
if start_index != -1 and end_index != -1:
|
42
|
-
try:
|
43
|
-
chunk_json = json.loads(buffer[start_index : end_index + 1])
|
44
|
-
if "error" in chunk_json:
|
45
|
-
raise RuntimeError(f"Error generating audio:\n{chunk_json['error']}")
|
46
|
-
if chunk_json["done"]:
|
47
|
-
break
|
48
|
-
audio = base64.b64decode(chunk_json["data"])
|
49
|
-
outputs.append({"audio": audio})
|
50
|
-
buffer = buffer[end_index + 1 :]
|
51
|
-
except json.JSONDecodeError:
|
52
|
-
break
|
53
|
-
return buffer, outputs
|
54
|
-
|
55
|
-
def send(
|
56
|
-
self,
|
57
|
-
model_id: str,
|
58
|
-
transcript: str,
|
59
|
-
output_format: OutputFormat,
|
60
|
-
voice_id: Optional[str] = None,
|
61
|
-
voice_embedding: Optional[List[float]] = None,
|
62
|
-
duration: Optional[int] = None,
|
63
|
-
language: Optional[str] = None,
|
64
|
-
stream: bool = True,
|
65
|
-
_experimental_voice_controls: Optional[VoiceControls] = None,
|
66
|
-
) -> Union[bytes, Generator[bytes, None, None]]:
|
67
|
-
"""Send a request to the server to generate audio using Server-Sent Events.
|
68
|
-
|
69
|
-
Args:
|
70
|
-
model_id: The ID of the model to use for generating audio.
|
71
|
-
transcript: The text to convert to speech.
|
72
|
-
voice_id: The ID of the voice to use for generating audio.
|
73
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
74
|
-
output_format: A dictionary containing the details of the output format.
|
75
|
-
duration: The duration of the audio in seconds.
|
76
|
-
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
77
|
-
stream: Whether to stream the audio or not.
|
78
|
-
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
79
|
-
Note: This is an experimental feature and may change rapidly in future releases.
|
80
|
-
|
81
|
-
Returns:
|
82
|
-
If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
|
83
|
-
If `stream` is False, the method returns a dictionary.
|
84
|
-
Both the generator and the dictionary contain the following key(s):
|
85
|
-
- audio: The audio as bytes.
|
86
|
-
"""
|
87
|
-
request_body = _construct_tts_request(
|
88
|
-
model_id=model_id,
|
89
|
-
transcript=transcript,
|
90
|
-
output_format=output_format,
|
91
|
-
voice_id=voice_id,
|
92
|
-
voice_embedding=voice_embedding,
|
93
|
-
duration=duration,
|
94
|
-
language=language,
|
95
|
-
_experimental_voice_controls=_experimental_voice_controls,
|
96
|
-
)
|
97
|
-
|
98
|
-
generator = self._sse_generator_wrapper(request_body)
|
99
|
-
|
100
|
-
if stream:
|
101
|
-
return generator
|
102
|
-
|
103
|
-
chunks = []
|
104
|
-
for chunk in generator:
|
105
|
-
chunks.append(chunk["audio"])
|
106
|
-
|
107
|
-
return {"audio": b"".join(chunks)}
|
108
|
-
|
109
|
-
@retry_on_connection_error(
|
110
|
-
max_retries=MAX_RETRIES, backoff_factor=BACKOFF_FACTOR, logger=logger
|
111
|
-
)
|
112
|
-
def _sse_generator_wrapper(self, request_body: Dict[str, Any]):
|
113
|
-
"""Need to wrap the sse generator in a function for the retry decorator to work."""
|
114
|
-
try:
|
115
|
-
for chunk in self._sse_generator(request_body):
|
116
|
-
yield chunk
|
117
|
-
except Exception as e:
|
118
|
-
raise RuntimeError(f"Error generating audio. {e}")
|
119
|
-
|
120
|
-
def _sse_generator(self, request_body: Dict[str, Any]):
|
121
|
-
response = requests.post(
|
122
|
-
f"{self.http_url}/tts/sse",
|
123
|
-
stream=True,
|
124
|
-
data=json.dumps(request_body),
|
125
|
-
headers=self.headers,
|
126
|
-
timeout=(self.timeout, self.timeout),
|
127
|
-
)
|
128
|
-
if not response.ok:
|
129
|
-
raise ValueError(f"Failed to generate audio. {response.text}")
|
130
|
-
|
131
|
-
buffer = ""
|
132
|
-
for chunk_bytes in response.iter_content(chunk_size=None):
|
133
|
-
buffer, outputs = self._update_buffer(buffer=buffer, chunk_bytes=chunk_bytes)
|
134
|
-
for output in outputs:
|
135
|
-
yield output
|
136
|
-
|
137
|
-
if buffer:
|
138
|
-
try:
|
139
|
-
chunk_json = json.loads(buffer)
|
140
|
-
audio = base64.b64decode(chunk_json["data"])
|
141
|
-
yield {"audio": audio}
|
142
|
-
except json.JSONDecodeError:
|
143
|
-
pass
|
cartesia/_types.py
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
from typing import List, Optional, TypedDict, Union
|
2
|
-
|
3
|
-
from cartesia.utils.deprecated import deprecated
|
4
|
-
|
5
|
-
|
6
|
-
class OutputFormatMapping:
|
7
|
-
_format_mapping = {
|
8
|
-
"raw_pcm_f32le_44100": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
|
9
|
-
"raw_pcm_s16le_44100": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 44100},
|
10
|
-
"raw_pcm_f32le_24000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 24000},
|
11
|
-
"raw_pcm_s16le_24000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 24000},
|
12
|
-
"raw_pcm_f32le_22050": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 22050},
|
13
|
-
"raw_pcm_s16le_22050": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 22050},
|
14
|
-
"raw_pcm_f32le_16000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 16000},
|
15
|
-
"raw_pcm_s16le_16000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
|
16
|
-
"raw_pcm_f32le_8000": {"container": "raw", "encoding": "pcm_f32le", "sample_rate": 8000},
|
17
|
-
"raw_pcm_s16le_8000": {"container": "raw", "encoding": "pcm_s16le", "sample_rate": 8000},
|
18
|
-
"raw_pcm_mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
|
19
|
-
"raw_pcm_alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
|
20
|
-
}
|
21
|
-
|
22
|
-
@classmethod
|
23
|
-
def get_format(cls, format_name):
|
24
|
-
if format_name in cls._format_mapping:
|
25
|
-
return cls._format_mapping[format_name]
|
26
|
-
else:
|
27
|
-
raise ValueError(f"Unsupported format: {format_name}")
|
28
|
-
|
29
|
-
|
30
|
-
class VoiceMetadata(TypedDict):
|
31
|
-
id: str
|
32
|
-
name: str
|
33
|
-
description: str
|
34
|
-
embedding: List[float]
|
35
|
-
is_public: bool
|
36
|
-
user_id: str
|
37
|
-
created_at: str
|
38
|
-
language: str
|
39
|
-
|
40
|
-
|
41
|
-
class VoiceControls(TypedDict):
|
42
|
-
"""Defines different voice control parameters for voice synthesis.
|
43
|
-
|
44
|
-
For a complete list of supported parameters, refer to the Cartesia API documentation.
|
45
|
-
https://docs.cartesia.ai/reference/api-reference
|
46
|
-
|
47
|
-
Examples:
|
48
|
-
>>> {"speed": "fastest"}
|
49
|
-
>>> {"speed": "slow", "emotion": ["sadness:high"]}
|
50
|
-
>>> {"emotion": ["surprise:highest", "curiosity"]}
|
51
|
-
|
52
|
-
Note:
|
53
|
-
This is an experimental class and is subject to rapid change in future versions.
|
54
|
-
"""
|
55
|
-
|
56
|
-
speed: Union[str, float] = ""
|
57
|
-
emotion: List[str] = []
|
58
|
-
|
59
|
-
|
60
|
-
class OutputFormat(TypedDict):
|
61
|
-
container: str
|
62
|
-
encoding: str
|
63
|
-
sample_rate: int
|
64
|
-
bit_rate: Optional[int] = None
|
65
|
-
|
66
|
-
|
67
|
-
class EventType:
|
68
|
-
NULL = ""
|
69
|
-
AUDIO = "chunk"
|
70
|
-
TIMESTAMPS = "timestamps"
|
cartesia/_websocket.py
DELETED
@@ -1,358 +0,0 @@
|
|
1
|
-
import base64
|
2
|
-
import json
|
3
|
-
import uuid
|
4
|
-
from collections import defaultdict
|
5
|
-
from typing import Any, Dict, Generator, Iterator, List, Optional, Set, Union
|
6
|
-
|
7
|
-
try:
|
8
|
-
from websockets.sync.client import connect
|
9
|
-
|
10
|
-
IS_WEBSOCKET_SYNC_AVAILABLE = True
|
11
|
-
except ImportError:
|
12
|
-
IS_WEBSOCKET_SYNC_AVAILABLE = False
|
13
|
-
|
14
|
-
from iterators import TimeoutIterator
|
15
|
-
|
16
|
-
from cartesia._types import EventType, OutputFormat, VoiceControls
|
17
|
-
from cartesia.utils.tts import _construct_tts_request
|
18
|
-
|
19
|
-
|
20
|
-
class _TTSContext:
|
21
|
-
"""Manage a single context over a WebSocket.
|
22
|
-
|
23
|
-
This class can be used to stream inputs, as they become available, to a specific `context_id`. See README for usage.
|
24
|
-
|
25
|
-
See :class:`_AsyncTTSContext` for asynchronous use cases.
|
26
|
-
|
27
|
-
Each TTSContext will close automatically when a done message is received for that context. It also closes if there is an error.
|
28
|
-
"""
|
29
|
-
|
30
|
-
def __init__(self, context_id: str, websocket: "_WebSocket"):
|
31
|
-
self._context_id = context_id
|
32
|
-
self._websocket = websocket
|
33
|
-
self._error = None
|
34
|
-
|
35
|
-
def __del__(self):
|
36
|
-
self._close()
|
37
|
-
|
38
|
-
@property
|
39
|
-
def context_id(self) -> str:
|
40
|
-
return self._context_id
|
41
|
-
|
42
|
-
def send(
|
43
|
-
self,
|
44
|
-
model_id: str,
|
45
|
-
transcript: Iterator[str],
|
46
|
-
output_format: OutputFormat,
|
47
|
-
voice_id: Optional[str] = None,
|
48
|
-
voice_embedding: Optional[List[float]] = None,
|
49
|
-
context_id: Optional[str] = None,
|
50
|
-
duration: Optional[int] = None,
|
51
|
-
language: Optional[str] = None,
|
52
|
-
add_timestamps: bool = False,
|
53
|
-
_experimental_voice_controls: Optional[VoiceControls] = None,
|
54
|
-
) -> Generator[bytes, None, None]:
|
55
|
-
"""Send audio generation requests to the WebSocket and yield responses.
|
56
|
-
|
57
|
-
Args:
|
58
|
-
model_id: The ID of the model to use for generating audio.
|
59
|
-
transcript: Iterator over text chunks with <1s latency.
|
60
|
-
output_format: A dictionary containing the details of the output format.
|
61
|
-
voice_id: The ID of the voice to use for generating audio.
|
62
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
63
|
-
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
64
|
-
duration: The duration of the audio in seconds.
|
65
|
-
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
66
|
-
add_timestamps: Whether to return word-level timestamps.
|
67
|
-
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
68
|
-
Note: This is an experimental feature and may change rapidly in future releases.
|
69
|
-
|
70
|
-
Yields:
|
71
|
-
Dictionary containing the following key(s):
|
72
|
-
- audio: The audio as bytes.
|
73
|
-
- context_id: The context ID for the request.
|
74
|
-
|
75
|
-
Raises:
|
76
|
-
ValueError: If provided context_id doesn't match the current context.
|
77
|
-
RuntimeError: If there's an error generating audio.
|
78
|
-
"""
|
79
|
-
if context_id is not None and context_id != self._context_id:
|
80
|
-
raise ValueError("Context ID does not match the context ID of the current context.")
|
81
|
-
|
82
|
-
self._websocket.connect()
|
83
|
-
|
84
|
-
# Create the initial request body
|
85
|
-
request_body = _construct_tts_request(
|
86
|
-
model_id=model_id,
|
87
|
-
transcript=transcript,
|
88
|
-
output_format=output_format,
|
89
|
-
voice_id=voice_id,
|
90
|
-
voice_embedding=voice_embedding,
|
91
|
-
duration=duration,
|
92
|
-
language=language,
|
93
|
-
context_id=self._context_id,
|
94
|
-
add_timestamps=add_timestamps,
|
95
|
-
_experimental_voice_controls=_experimental_voice_controls,
|
96
|
-
)
|
97
|
-
|
98
|
-
try:
|
99
|
-
# Create an iterator with a timeout to get text chunks
|
100
|
-
text_iterator = TimeoutIterator(
|
101
|
-
transcript, timeout=0.001
|
102
|
-
) # 1ms timeout for nearly non-blocking receive
|
103
|
-
next_chunk = next(text_iterator, None)
|
104
|
-
|
105
|
-
while True:
|
106
|
-
# Send the next text chunk to the WebSocket if available
|
107
|
-
if next_chunk is not None and next_chunk != text_iterator.get_sentinel():
|
108
|
-
request_body["transcript"] = next_chunk
|
109
|
-
request_body["continue"] = True
|
110
|
-
self._websocket.websocket.send(json.dumps(request_body))
|
111
|
-
next_chunk = next(text_iterator, None)
|
112
|
-
|
113
|
-
try:
|
114
|
-
# Receive responses from the WebSocket with a small timeout
|
115
|
-
response = json.loads(
|
116
|
-
self._websocket.websocket.recv(timeout=0.001)
|
117
|
-
) # 1ms timeout for nearly non-blocking receive
|
118
|
-
if response["context_id"] != self._context_id:
|
119
|
-
pass
|
120
|
-
if "error" in response:
|
121
|
-
raise RuntimeError(f"Error generating audio:\n{response['error']}")
|
122
|
-
if response["done"]:
|
123
|
-
break
|
124
|
-
if "data" in response and response["data"]:
|
125
|
-
yield self._websocket._convert_response(
|
126
|
-
response=response, include_context_id=True
|
127
|
-
)
|
128
|
-
except TimeoutError:
|
129
|
-
pass
|
130
|
-
|
131
|
-
# Continuously receive from WebSocket until the next text chunk is available
|
132
|
-
while next_chunk == text_iterator.get_sentinel():
|
133
|
-
try:
|
134
|
-
response = json.loads(self._websocket.websocket.recv(timeout=0.001))
|
135
|
-
if response["context_id"] != self._context_id:
|
136
|
-
continue
|
137
|
-
if "error" in response:
|
138
|
-
raise RuntimeError(f"Error generating audio:\n{response['error']}")
|
139
|
-
if response["done"]:
|
140
|
-
break
|
141
|
-
if "data" in response and response["data"]:
|
142
|
-
yield self._websocket._convert_response(
|
143
|
-
response=response, include_context_id=True
|
144
|
-
)
|
145
|
-
except TimeoutError:
|
146
|
-
pass
|
147
|
-
next_chunk = next(text_iterator, None)
|
148
|
-
|
149
|
-
# Send final message if all input text chunks are exhausted
|
150
|
-
if next_chunk is None:
|
151
|
-
request_body["transcript"] = ""
|
152
|
-
request_body["continue"] = False
|
153
|
-
self._websocket.websocket.send(json.dumps(request_body))
|
154
|
-
break
|
155
|
-
|
156
|
-
# Receive remaining messages from the WebSocket until "done" is received
|
157
|
-
while True:
|
158
|
-
response = json.loads(self._websocket.websocket.recv())
|
159
|
-
if response["context_id"] != self._context_id:
|
160
|
-
continue
|
161
|
-
if "error" in response:
|
162
|
-
raise RuntimeError(f"Error generating audio:\n{response['error']}")
|
163
|
-
if response["done"]:
|
164
|
-
break
|
165
|
-
yield self._websocket._convert_response(response=response, include_context_id=True)
|
166
|
-
|
167
|
-
except Exception as e:
|
168
|
-
self._websocket.close()
|
169
|
-
raise RuntimeError(f"Failed to generate audio. {e}")
|
170
|
-
|
171
|
-
def _close(self):
|
172
|
-
"""Closes the context. Automatically called when a done message is received for this context."""
|
173
|
-
self._websocket._remove_context(self._context_id)
|
174
|
-
|
175
|
-
def is_closed(self):
|
176
|
-
"""Check if the context is closed or not. Returns True if closed."""
|
177
|
-
return self._context_id not in self._websocket._contexts
|
178
|
-
|
179
|
-
|
180
|
-
class _WebSocket:
|
181
|
-
"""This class contains methods to generate audio using WebSocket. Ideal for low-latency audio generation.
|
182
|
-
|
183
|
-
Usage:
|
184
|
-
>>> ws = client.tts.websocket()
|
185
|
-
>>> for audio_chunk in ws.send(
|
186
|
-
... model_id="sonic-english", transcript="Hello world!", voice_embedding=embedding,
|
187
|
-
... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
|
188
|
-
... context_id=context_id, stream=True
|
189
|
-
... ):
|
190
|
-
... audio = audio_chunk["audio"]
|
191
|
-
"""
|
192
|
-
|
193
|
-
def __init__(
|
194
|
-
self,
|
195
|
-
ws_url: str,
|
196
|
-
api_key: str,
|
197
|
-
cartesia_version: str,
|
198
|
-
):
|
199
|
-
self.ws_url = ws_url
|
200
|
-
self.api_key = api_key
|
201
|
-
self.cartesia_version = cartesia_version
|
202
|
-
self.websocket = None
|
203
|
-
self._contexts: Set[str] = set()
|
204
|
-
|
205
|
-
def __del__(self):
|
206
|
-
try:
|
207
|
-
self.close()
|
208
|
-
except Exception as e:
|
209
|
-
raise RuntimeError("Failed to close WebSocket: ", e)
|
210
|
-
|
211
|
-
def connect(self):
|
212
|
-
"""This method connects to the WebSocket if it is not already connected.
|
213
|
-
|
214
|
-
Raises:
|
215
|
-
RuntimeError: If the connection to the WebSocket fails.
|
216
|
-
"""
|
217
|
-
if not IS_WEBSOCKET_SYNC_AVAILABLE:
|
218
|
-
raise ImportError(
|
219
|
-
"The synchronous WebSocket client is not available. Please ensure that you have 'websockets>=12.0' or compatible version installed."
|
220
|
-
)
|
221
|
-
if self.websocket is None or self._is_websocket_closed():
|
222
|
-
route = "tts/websocket"
|
223
|
-
try:
|
224
|
-
self.websocket = connect(
|
225
|
-
f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
|
226
|
-
)
|
227
|
-
except Exception as e:
|
228
|
-
raise RuntimeError(f"Failed to connect to WebSocket. {e}")
|
229
|
-
|
230
|
-
def _is_websocket_closed(self):
|
231
|
-
return self.websocket.socket.fileno() == -1
|
232
|
-
|
233
|
-
def close(self):
|
234
|
-
"""This method closes the WebSocket connection. *Highly* recommended to call this method when done using the WebSocket."""
|
235
|
-
if self.websocket and not self._is_websocket_closed():
|
236
|
-
self.websocket.close()
|
237
|
-
|
238
|
-
if self._contexts:
|
239
|
-
self._contexts.clear()
|
240
|
-
|
241
|
-
def _convert_response(
|
242
|
-
self, response: Dict[str, any], include_context_id: bool, include_flush_id: bool = False
|
243
|
-
) -> Dict[str, Any]:
|
244
|
-
out = {}
|
245
|
-
if response["type"] == EventType.AUDIO:
|
246
|
-
out["audio"] = base64.b64decode(response["data"])
|
247
|
-
elif response["type"] == EventType.TIMESTAMPS:
|
248
|
-
out["word_timestamps"] = response["word_timestamps"]
|
249
|
-
|
250
|
-
if include_context_id:
|
251
|
-
out["context_id"] = response["context_id"]
|
252
|
-
|
253
|
-
if include_flush_id and "flush_id" in response:
|
254
|
-
out["flush_id"] = response["flush_id"]
|
255
|
-
|
256
|
-
return out
|
257
|
-
|
258
|
-
def send(
|
259
|
-
self,
|
260
|
-
model_id: str,
|
261
|
-
transcript: str,
|
262
|
-
output_format: dict,
|
263
|
-
voice_id: Optional[str] = None,
|
264
|
-
voice_embedding: Optional[List[float]] = None,
|
265
|
-
context_id: Optional[str] = None,
|
266
|
-
duration: Optional[int] = None,
|
267
|
-
language: Optional[str] = None,
|
268
|
-
stream: bool = True,
|
269
|
-
add_timestamps: bool = False,
|
270
|
-
_experimental_voice_controls: Optional[VoiceControls] = None,
|
271
|
-
) -> Union[bytes, Generator[bytes, None, None]]:
|
272
|
-
"""Send a request to the WebSocket to generate audio.
|
273
|
-
|
274
|
-
Args:
|
275
|
-
model_id: The ID of the model to use for generating audio.
|
276
|
-
transcript: The text to convert to speech.
|
277
|
-
output_format: A dictionary containing the details of the output format.
|
278
|
-
voice_id: The ID of the voice to use for generating audio.
|
279
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
280
|
-
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
281
|
-
duration: The duration of the audio in seconds.
|
282
|
-
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
283
|
-
stream: Whether to stream the audio or not.
|
284
|
-
add_timestamps: Whether to return word-level timestamps.
|
285
|
-
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
286
|
-
Note: This is an experimental feature and may change rapidly in future releases.
|
287
|
-
|
288
|
-
Returns:
|
289
|
-
If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
|
290
|
-
If `stream` is False, the method returns a dictionary.
|
291
|
-
Both the generator and the dictionary contain the following key(s):
|
292
|
-
- audio: The audio as bytes.
|
293
|
-
- context_id: The context ID for the request.
|
294
|
-
"""
|
295
|
-
self.connect()
|
296
|
-
|
297
|
-
if context_id is None:
|
298
|
-
context_id = str(uuid.uuid4())
|
299
|
-
|
300
|
-
request_body = _construct_tts_request(
|
301
|
-
model_id=model_id,
|
302
|
-
transcript=transcript,
|
303
|
-
output_format=output_format,
|
304
|
-
voice_id=voice_id,
|
305
|
-
voice_embedding=voice_embedding,
|
306
|
-
context_id=context_id,
|
307
|
-
duration=duration,
|
308
|
-
language=language,
|
309
|
-
add_timestamps=add_timestamps,
|
310
|
-
_experimental_voice_controls=_experimental_voice_controls,
|
311
|
-
)
|
312
|
-
|
313
|
-
generator = self._websocket_generator(request_body)
|
314
|
-
|
315
|
-
if stream:
|
316
|
-
return generator
|
317
|
-
|
318
|
-
chunks = []
|
319
|
-
word_timestamps = defaultdict(list)
|
320
|
-
for chunk in generator:
|
321
|
-
if "audio" in chunk:
|
322
|
-
chunks.append(chunk["audio"])
|
323
|
-
if add_timestamps and "word_timestamps" in chunk:
|
324
|
-
for k, v in chunk["word_timestamps"].items():
|
325
|
-
word_timestamps[k].extend(v)
|
326
|
-
out = {"audio": b"".join(chunks), "context_id": context_id}
|
327
|
-
if add_timestamps:
|
328
|
-
out["word_timestamps"] = word_timestamps
|
329
|
-
return out
|
330
|
-
|
331
|
-
def _websocket_generator(self, request_body: Dict[str, Any]):
|
332
|
-
self.websocket.send(json.dumps(request_body))
|
333
|
-
|
334
|
-
try:
|
335
|
-
while True:
|
336
|
-
response = json.loads(self.websocket.recv())
|
337
|
-
if "error" in response:
|
338
|
-
raise RuntimeError(f"Error generating audio:\n{response['error']}")
|
339
|
-
if response["done"]:
|
340
|
-
break
|
341
|
-
yield self._convert_response(response=response, include_context_id=True)
|
342
|
-
except Exception as e:
|
343
|
-
# Close the websocket connection if an error occurs.
|
344
|
-
self.close()
|
345
|
-
raise RuntimeError(f"Failed to generate audio. {response}") from e
|
346
|
-
|
347
|
-
def _remove_context(self, context_id: str):
|
348
|
-
if context_id in self._contexts:
|
349
|
-
self._contexts.remove(context_id)
|
350
|
-
|
351
|
-
def context(self, context_id: Optional[str] = None) -> _TTSContext:
|
352
|
-
if context_id in self._contexts:
|
353
|
-
raise ValueError(f"Context for context ID {context_id} already exists.")
|
354
|
-
if context_id is None:
|
355
|
-
context_id = str(uuid.uuid4())
|
356
|
-
if context_id not in self._contexts:
|
357
|
-
self._contexts.add(context_id)
|
358
|
-
return _TTSContext(context_id, self)
|
cartesia/async_client.py
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
import asyncio
|
2
|
-
from types import TracebackType
|
3
|
-
from typing import Optional, Union
|
4
|
-
|
5
|
-
import aiohttp
|
6
|
-
|
7
|
-
from cartesia._constants import DEFAULT_NUM_CONNECTIONS, DEFAULT_TIMEOUT
|
8
|
-
from cartesia.async_tts import AsyncTTS
|
9
|
-
from cartesia.client import Cartesia
|
10
|
-
|
11
|
-
|
12
|
-
class AsyncCartesia(Cartesia):
|
13
|
-
"""The asynchronous version of the Cartesia client."""
|
14
|
-
|
15
|
-
def __init__(
|
16
|
-
self,
|
17
|
-
*,
|
18
|
-
api_key: Optional[str] = None,
|
19
|
-
base_url: Optional[str] = None,
|
20
|
-
timeout: float = DEFAULT_TIMEOUT,
|
21
|
-
max_num_connections: int = DEFAULT_NUM_CONNECTIONS,
|
22
|
-
):
|
23
|
-
"""
|
24
|
-
Args:
|
25
|
-
api_key: See :class:`Cartesia`.
|
26
|
-
base_url: See :class:`Cartesia`.
|
27
|
-
timeout: See :class:`Cartesia`.
|
28
|
-
max_num_connections: The maximum number of concurrent connections to use for the client.
|
29
|
-
This is used to limit the number of connections that can be made to the server.
|
30
|
-
"""
|
31
|
-
self._session = None
|
32
|
-
self._loop = None
|
33
|
-
super().__init__(api_key=api_key, base_url=base_url, timeout=timeout)
|
34
|
-
self.max_num_connections = max_num_connections
|
35
|
-
self.tts = AsyncTTS(
|
36
|
-
api_key=self.api_key,
|
37
|
-
base_url=self._base_url,
|
38
|
-
timeout=self.timeout,
|
39
|
-
get_session=self._get_session,
|
40
|
-
)
|
41
|
-
|
42
|
-
async def _get_session(self):
|
43
|
-
current_loop = asyncio.get_event_loop()
|
44
|
-
if self._loop is not current_loop:
|
45
|
-
# If the loop has changed, close the session and create a new one.
|
46
|
-
await self.close()
|
47
|
-
if self._session is None or self._session.closed:
|
48
|
-
timeout = aiohttp.ClientTimeout(total=self.timeout)
|
49
|
-
connector = aiohttp.TCPConnector(limit=self.max_num_connections)
|
50
|
-
self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
|
51
|
-
self._loop = current_loop
|
52
|
-
return self._session
|
53
|
-
|
54
|
-
async def close(self):
|
55
|
-
"""This method closes the session.
|
56
|
-
|
57
|
-
It is *strongly* recommended to call this method when you are done using the client.
|
58
|
-
"""
|
59
|
-
if self._session is not None and not self._session.closed:
|
60
|
-
await self._session.close()
|
61
|
-
|
62
|
-
def __del__(self):
|
63
|
-
try:
|
64
|
-
loop = asyncio.get_running_loop()
|
65
|
-
except RuntimeError:
|
66
|
-
loop = None
|
67
|
-
|
68
|
-
if loop is None:
|
69
|
-
asyncio.run(self.close())
|
70
|
-
elif loop.is_running():
|
71
|
-
loop.create_task(self.close())
|
72
|
-
|
73
|
-
async def __aenter__(self):
|
74
|
-
return self
|
75
|
-
|
76
|
-
async def __aexit__(
|
77
|
-
self,
|
78
|
-
exc_type: Union[type, None],
|
79
|
-
exc: Union[BaseException, None],
|
80
|
-
exc_tb: Union[TracebackType, None],
|
81
|
-
):
|
82
|
-
await self.close()
|