cartesia 1.4.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +302 -3
- cartesia/api_status/__init__.py +6 -0
- cartesia/api_status/client.py +104 -0
- cartesia/api_status/requests/__init__.py +5 -0
- cartesia/api_status/requests/api_info.py +8 -0
- cartesia/api_status/types/__init__.py +5 -0
- cartesia/api_status/types/api_info.py +20 -0
- cartesia/base_client.py +156 -0
- cartesia/client.py +163 -40
- cartesia/core/__init__.py +50 -0
- cartesia/core/api_error.py +15 -0
- cartesia/core/client_wrapper.py +55 -0
- cartesia/core/datetime_utils.py +28 -0
- cartesia/core/file.py +67 -0
- cartesia/core/http_client.py +499 -0
- cartesia/core/jsonable_encoder.py +101 -0
- cartesia/core/pagination.py +88 -0
- cartesia/core/pydantic_utilities.py +296 -0
- cartesia/core/query_encoder.py +58 -0
- cartesia/core/remove_none_from_dict.py +11 -0
- cartesia/core/request_options.py +35 -0
- cartesia/core/serialization.py +272 -0
- cartesia/datasets/__init__.py +24 -0
- cartesia/datasets/requests/__init__.py +15 -0
- cartesia/datasets/requests/create_dataset_request.py +7 -0
- cartesia/datasets/requests/dataset.py +9 -0
- cartesia/datasets/requests/dataset_file.py +9 -0
- cartesia/datasets/requests/paginated_dataset_files.py +10 -0
- cartesia/datasets/requests/paginated_datasets.py +10 -0
- cartesia/datasets/types/__init__.py +17 -0
- cartesia/datasets/types/create_dataset_request.py +19 -0
- cartesia/datasets/types/dataset.py +21 -0
- cartesia/datasets/types/dataset_file.py +21 -0
- cartesia/datasets/types/file_purpose.py +5 -0
- cartesia/datasets/types/paginated_dataset_files.py +21 -0
- cartesia/datasets/types/paginated_datasets.py +21 -0
- cartesia/embedding/__init__.py +5 -0
- cartesia/embedding/types/__init__.py +5 -0
- cartesia/embedding/types/embedding.py +201 -0
- cartesia/environment.py +7 -0
- cartesia/infill/__init__.py +2 -0
- cartesia/infill/client.py +318 -0
- cartesia/tts/__init__.py +167 -0
- cartesia/{_async_websocket.py → tts/_async_websocket.py} +212 -85
- cartesia/tts/_websocket.py +479 -0
- cartesia/tts/client.py +407 -0
- cartesia/tts/requests/__init__.py +76 -0
- cartesia/tts/requests/cancel_context_request.py +17 -0
- cartesia/tts/requests/controls.py +11 -0
- cartesia/tts/requests/generation_request.py +58 -0
- cartesia/tts/requests/mp_3_output_format.py +11 -0
- cartesia/tts/requests/output_format.py +30 -0
- cartesia/tts/requests/phoneme_timestamps.py +10 -0
- cartesia/tts/requests/raw_output_format.py +11 -0
- cartesia/tts/requests/speed.py +7 -0
- cartesia/tts/requests/tts_request.py +24 -0
- cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
- cartesia/tts/requests/tts_request_id_specifier.py +16 -0
- cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
- cartesia/tts/requests/wav_output_format.py +7 -0
- cartesia/tts/requests/web_socket_base_response.py +11 -0
- cartesia/tts/requests/web_socket_chunk_response.py +11 -0
- cartesia/tts/requests/web_socket_done_response.py +7 -0
- cartesia/tts/requests/web_socket_error_response.py +7 -0
- cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
- cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
- cartesia/tts/requests/web_socket_request.py +7 -0
- cartesia/tts/requests/web_socket_response.py +70 -0
- cartesia/tts/requests/web_socket_stream_options.py +8 -0
- cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
- cartesia/tts/requests/web_socket_tts_output.py +18 -0
- cartesia/tts/requests/web_socket_tts_request.py +25 -0
- cartesia/tts/requests/word_timestamps.py +10 -0
- cartesia/tts/socket_client.py +302 -0
- cartesia/tts/types/__init__.py +90 -0
- cartesia/tts/types/cancel_context_request.py +28 -0
- cartesia/tts/types/context_id.py +3 -0
- cartesia/tts/types/controls.py +22 -0
- cartesia/tts/types/emotion.py +34 -0
- cartesia/tts/types/flush_id.py +3 -0
- cartesia/tts/types/generation_request.py +71 -0
- cartesia/tts/types/mp_3_output_format.py +23 -0
- cartesia/tts/types/natural_specifier.py +5 -0
- cartesia/tts/types/numerical_specifier.py +3 -0
- cartesia/tts/types/output_format.py +58 -0
- cartesia/tts/types/phoneme_timestamps.py +21 -0
- cartesia/tts/types/raw_encoding.py +5 -0
- cartesia/tts/types/raw_output_format.py +22 -0
- cartesia/tts/types/speed.py +7 -0
- cartesia/tts/types/supported_language.py +7 -0
- cartesia/tts/types/tts_request.py +35 -0
- cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
- cartesia/tts/types/tts_request_id_specifier.py +27 -0
- cartesia/tts/types/tts_request_voice_specifier.py +7 -0
- cartesia/tts/types/wav_output_format.py +17 -0
- cartesia/tts/types/web_socket_base_response.py +22 -0
- cartesia/tts/types/web_socket_chunk_response.py +22 -0
- cartesia/tts/types/web_socket_done_response.py +17 -0
- cartesia/tts/types/web_socket_error_response.py +19 -0
- cartesia/tts/types/web_socket_flush_done_response.py +21 -0
- cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_raw_output_format.py +22 -0
- cartesia/tts/types/web_socket_request.py +7 -0
- cartesia/tts/types/web_socket_response.py +125 -0
- cartesia/tts/types/web_socket_stream_options.py +19 -0
- cartesia/tts/types/web_socket_timestamps_response.py +20 -0
- cartesia/tts/types/web_socket_tts_output.py +29 -0
- cartesia/tts/types/web_socket_tts_request.py +37 -0
- cartesia/tts/types/word_timestamps.py +21 -0
- cartesia/{_constants.py → tts/utils/constants.py} +2 -2
- cartesia/tts/utils/tts.py +64 -0
- cartesia/tts/utils/types.py +70 -0
- cartesia/version.py +3 -1
- cartesia/voice_changer/__init__.py +27 -0
- cartesia/voice_changer/client.py +395 -0
- cartesia/voice_changer/requests/__init__.py +15 -0
- cartesia/voice_changer/requests/streaming_response.py +38 -0
- cartesia/voice_changer/types/__init__.py +17 -0
- cartesia/voice_changer/types/output_format_container.py +5 -0
- cartesia/voice_changer/types/streaming_response.py +64 -0
- cartesia/voices/__init__.py +81 -0
- cartesia/voices/client.py +1218 -0
- cartesia/voices/requests/__init__.py +29 -0
- cartesia/voices/requests/create_voice_request.py +23 -0
- cartesia/voices/requests/embedding_response.py +8 -0
- cartesia/voices/requests/embedding_specifier.py +10 -0
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/id_specifier.py +10 -0
- cartesia/voices/requests/localize_dialect.py +11 -0
- cartesia/voices/requests/localize_voice_request.py +28 -0
- cartesia/voices/requests/mix_voice_specifier.py +7 -0
- cartesia/voices/requests/mix_voices_request.py +9 -0
- cartesia/voices/requests/update_voice_request.py +15 -0
- cartesia/voices/requests/voice.py +43 -0
- cartesia/voices/requests/voice_metadata.py +36 -0
- cartesia/voices/types/__init__.py +53 -0
- cartesia/voices/types/base_voice_id.py +5 -0
- cartesia/voices/types/clone_mode.py +5 -0
- cartesia/voices/types/create_voice_request.py +34 -0
- cartesia/voices/types/embedding_response.py +20 -0
- cartesia/voices/types/embedding_specifier.py +22 -0
- cartesia/voices/types/gender.py +5 -0
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/id_specifier.py +22 -0
- cartesia/voices/types/localize_dialect.py +11 -0
- cartesia/voices/types/localize_english_dialect.py +5 -0
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_portuguese_dialect.py +5 -0
- cartesia/voices/types/localize_spanish_dialect.py +5 -0
- cartesia/voices/types/localize_target_language.py +7 -0
- cartesia/voices/types/localize_voice_request.py +39 -0
- cartesia/voices/types/mix_voice_specifier.py +7 -0
- cartesia/voices/types/mix_voices_request.py +20 -0
- cartesia/voices/types/update_voice_request.py +27 -0
- cartesia/voices/types/voice.py +54 -0
- cartesia/voices/types/voice_expand_options.py +5 -0
- cartesia/voices/types/voice_id.py +3 -0
- cartesia/voices/types/voice_metadata.py +48 -0
- cartesia/voices/types/weight.py +3 -0
- cartesia-2.0.0.dist-info/METADATA +414 -0
- cartesia-2.0.0.dist-info/RECORD +165 -0
- {cartesia-1.4.0.dist-info → cartesia-2.0.0.dist-info}/WHEEL +1 -1
- cartesia/_async_sse.py +0 -95
- cartesia/_logger.py +0 -3
- cartesia/_sse.py +0 -143
- cartesia/_types.py +0 -70
- cartesia/_websocket.py +0 -358
- cartesia/async_client.py +0 -82
- cartesia/async_tts.py +0 -176
- cartesia/resource.py +0 -44
- cartesia/tts.py +0 -292
- cartesia/utils/deprecated.py +0 -55
- cartesia/utils/retry.py +0 -87
- cartesia/utils/tts.py +0 -78
- cartesia/voices.py +0 -204
- cartesia-1.4.0.dist-info/METADATA +0 -663
- cartesia-1.4.0.dist-info/RECORD +0 -23
- cartesia-1.4.0.dist-info/licenses/LICENSE.md +0 -21
- /cartesia/{utils/__init__.py → py.typed} +0 -0
@@ -1,4 +1,6 @@
|
|
1
1
|
import asyncio
|
2
|
+
import json
|
3
|
+
import typing
|
2
4
|
import uuid
|
3
5
|
from collections import defaultdict
|
4
6
|
from types import TracebackType
|
@@ -6,11 +8,27 @@ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
|
|
6
8
|
|
7
9
|
import aiohttp
|
8
10
|
|
9
|
-
from cartesia.
|
10
|
-
from cartesia.
|
11
|
-
from cartesia.
|
12
|
-
|
13
|
-
|
11
|
+
from cartesia.tts.requests import TtsRequestVoiceSpecifierParams
|
12
|
+
from cartesia.tts.requests.output_format import OutputFormatParams
|
13
|
+
from cartesia.tts.types import (
|
14
|
+
WebSocketResponse,
|
15
|
+
WebSocketResponse_Done,
|
16
|
+
WebSocketResponse_Error,
|
17
|
+
WebSocketResponse_FlushDone,
|
18
|
+
WebSocketTtsOutput,
|
19
|
+
WordTimestamps,
|
20
|
+
PhonemeTimestamps,
|
21
|
+
)
|
22
|
+
|
23
|
+
from ..core.pydantic_utilities import parse_obj_as
|
24
|
+
from ._websocket import TtsWebsocket
|
25
|
+
from .types.generation_request import GenerationRequest
|
26
|
+
from .utils.constants import (
|
27
|
+
DEFAULT_MODEL_ID,
|
28
|
+
DEFAULT_OUTPUT_FORMAT,
|
29
|
+
DEFAULT_VOICE_EMBEDDING,
|
30
|
+
)
|
31
|
+
from .utils.tts import get_output_format
|
14
32
|
|
15
33
|
|
16
34
|
class _AsyncTTSContext:
|
@@ -26,7 +44,9 @@ class _AsyncTTSContext:
|
|
26
44
|
|
27
45
|
"""
|
28
46
|
|
29
|
-
def __init__(
|
47
|
+
def __init__(
|
48
|
+
self, context_id: str, websocket: "AsyncTtsWebsocket", timeout: float = 30
|
49
|
+
):
|
30
50
|
self._context_id = context_id
|
31
51
|
self._websocket = websocket
|
32
52
|
self.timeout = timeout
|
@@ -38,60 +58,79 @@ class _AsyncTTSContext:
|
|
38
58
|
|
39
59
|
async def send(
|
40
60
|
self,
|
61
|
+
*,
|
41
62
|
model_id: str,
|
42
63
|
transcript: str,
|
43
|
-
output_format:
|
44
|
-
|
45
|
-
voice_embedding: Optional[List[float]] = None,
|
64
|
+
output_format: OutputFormatParams,
|
65
|
+
voice: TtsRequestVoiceSpecifierParams,
|
46
66
|
context_id: Optional[str] = None,
|
47
|
-
continue_: bool = False,
|
48
|
-
flush: bool = False,
|
49
67
|
duration: Optional[int] = None,
|
50
68
|
language: Optional[str] = None,
|
69
|
+
stream: bool = True,
|
51
70
|
add_timestamps: bool = False,
|
52
|
-
|
71
|
+
add_phoneme_timestamps: bool = False,
|
72
|
+
use_original_timestamps: bool = False,
|
73
|
+
continue_: bool = False,
|
74
|
+
flush: bool = False,
|
53
75
|
) -> None:
|
54
76
|
"""Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
|
55
77
|
|
56
78
|
Args:
|
57
|
-
|
58
|
-
transcript: The text to convert to speech.
|
59
|
-
output_format: A dictionary containing the details of the output format.
|
60
|
-
voice_id: The ID of the voice to use for generating audio.
|
61
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
62
|
-
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
63
|
-
continue_: Whether to continue the audio generation from the previous transcript or not.
|
64
|
-
flush: Whether to trigger a manual flush for the current context's generation.
|
65
|
-
duration: The duration of the audio in seconds.
|
66
|
-
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
|
67
|
-
add_timestamps: Whether to return word-level timestamps.
|
68
|
-
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
69
|
-
Note: This is an experimental feature and may change rapidly in future releases.
|
79
|
+
request: The request to generate audio.
|
70
80
|
|
71
81
|
Returns:
|
72
82
|
None.
|
73
83
|
"""
|
74
|
-
if context_id is not None and context_id != self._context_id:
|
75
|
-
raise ValueError("Context ID does not match the context ID of the current context.")
|
76
|
-
if continue_ and transcript == "" and not flush:
|
77
|
-
raise ValueError("Transcript cannot be empty when continue_ is True.")
|
78
|
-
|
79
84
|
await self._websocket.connect()
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
85
|
+
assert self._websocket.websocket is not None, "WebSocket is not connected"
|
86
|
+
|
87
|
+
request_body = {
|
88
|
+
"model_id": model_id,
|
89
|
+
"transcript": transcript,
|
90
|
+
"output_format": (
|
91
|
+
output_format
|
92
|
+
if isinstance(output_format, dict)
|
93
|
+
else output_format.dict()
|
94
|
+
),
|
95
|
+
"voice": (voice if isinstance(voice, dict) else voice.dict()),
|
96
|
+
"context_id": self._context_id,
|
97
|
+
}
|
98
|
+
if context_id is not None:
|
99
|
+
request_body["context_id"] = context_id
|
100
|
+
if duration is not None:
|
101
|
+
request_body["duration"] = duration
|
102
|
+
if language is not None:
|
103
|
+
request_body["language"] = language
|
104
|
+
if stream:
|
105
|
+
request_body["stream"] = stream
|
106
|
+
if add_timestamps:
|
107
|
+
request_body["add_timestamps"] = add_timestamps
|
108
|
+
if add_phoneme_timestamps:
|
109
|
+
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
110
|
+
if use_original_timestamps:
|
111
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
112
|
+
if continue_:
|
113
|
+
request_body["continue"] = continue_
|
114
|
+
if flush:
|
115
|
+
request_body["flush"] = flush
|
116
|
+
|
117
|
+
if (
|
118
|
+
"context_id" in request_body
|
119
|
+
and request_body["context_id"] is not None
|
120
|
+
and request_body["context_id"] != self._context_id
|
121
|
+
):
|
122
|
+
raise ValueError(
|
123
|
+
"Context ID does not match the context ID of the current context."
|
124
|
+
)
|
125
|
+
request_body["context_id"] = self._context_id
|
126
|
+
|
127
|
+
if (
|
128
|
+
"continue" in request_body
|
129
|
+
and request_body["continue"]
|
130
|
+
and request_body["transcript"] == ""
|
131
|
+
and ("flush" in request_body and not request_body["flush"])
|
132
|
+
):
|
133
|
+
raise ValueError("Transcript cannot be empty when continue_ is True.")
|
95
134
|
|
96
135
|
await self._websocket.websocket.send_json(request_body)
|
97
136
|
|
@@ -103,8 +142,11 @@ class _AsyncTTSContext:
|
|
103
142
|
await self.send(
|
104
143
|
model_id=DEFAULT_MODEL_ID,
|
105
144
|
transcript="",
|
106
|
-
output_format=
|
107
|
-
|
145
|
+
output_format=get_output_format(DEFAULT_OUTPUT_FORMAT),
|
146
|
+
voice={
|
147
|
+
"mode": "embedding",
|
148
|
+
"embedding": DEFAULT_VOICE_EMBEDDING,
|
149
|
+
},
|
108
150
|
context_id=self._context_id,
|
109
151
|
continue_=False,
|
110
152
|
)
|
@@ -114,8 +156,11 @@ class _AsyncTTSContext:
|
|
114
156
|
await self.send(
|
115
157
|
model_id=DEFAULT_MODEL_ID,
|
116
158
|
transcript="",
|
117
|
-
output_format=
|
118
|
-
|
159
|
+
output_format=get_output_format(DEFAULT_OUTPUT_FORMAT),
|
160
|
+
voice={
|
161
|
+
"mode": "embedding",
|
162
|
+
"embedding": DEFAULT_VOICE_EMBEDDING,
|
163
|
+
},
|
119
164
|
context_id=self._context_id,
|
120
165
|
continue_=True,
|
121
166
|
flush=True,
|
@@ -134,11 +179,23 @@ class _AsyncTTSContext:
|
|
134
179
|
response = await self._websocket._get_message(
|
135
180
|
self._context_id, timeout=self.timeout, flush_id=flush_id
|
136
181
|
)
|
137
|
-
|
138
|
-
|
139
|
-
|
182
|
+
response_obj = typing.cast(
|
183
|
+
WebSocketResponse,
|
184
|
+
parse_obj_as(
|
185
|
+
type_=WebSocketResponse, object_=response # type: ignore
|
186
|
+
),
|
187
|
+
)
|
188
|
+
if isinstance(response_obj, WebSocketResponse_Error):
|
189
|
+
raise RuntimeError(
|
190
|
+
f"Error generating audio:\n{response_obj.error}"
|
191
|
+
)
|
192
|
+
if isinstance(response_obj, WebSocketResponse_Done) or isinstance(
|
193
|
+
response_obj, WebSocketResponse_FlushDone
|
194
|
+
):
|
140
195
|
break
|
141
|
-
yield self._websocket._convert_response(
|
196
|
+
yield self._websocket._convert_response(
|
197
|
+
response_obj, include_context_id=True
|
198
|
+
)
|
142
199
|
except Exception as e:
|
143
200
|
if isinstance(e, asyncio.TimeoutError):
|
144
201
|
raise RuntimeError("Timeout while waiting for audio chunk")
|
@@ -146,7 +203,7 @@ class _AsyncTTSContext:
|
|
146
203
|
|
147
204
|
return generator
|
148
205
|
|
149
|
-
async def receive(self) -> AsyncGenerator[
|
206
|
+
async def receive(self) -> AsyncGenerator[WebSocketTtsOutput, None]:
|
150
207
|
"""Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.
|
151
208
|
|
152
209
|
Returns:
|
@@ -157,11 +214,21 @@ class _AsyncTTSContext:
|
|
157
214
|
response = await self._websocket._get_message(
|
158
215
|
self._context_id, timeout=self.timeout
|
159
216
|
)
|
160
|
-
|
161
|
-
|
162
|
-
|
217
|
+
response_obj = typing.cast(
|
218
|
+
WebSocketResponse,
|
219
|
+
parse_obj_as(
|
220
|
+
type_=WebSocketResponse, # type: ignore
|
221
|
+
object_=response,
|
222
|
+
),
|
223
|
+
)
|
224
|
+
|
225
|
+
if isinstance(response_obj, WebSocketResponse_Error):
|
226
|
+
raise RuntimeError(f"Error generating audio:\n{response_obj.error}")
|
227
|
+
if isinstance(response_obj, WebSocketResponse_Done):
|
163
228
|
break
|
164
|
-
yield self._websocket._convert_response(
|
229
|
+
yield self._websocket._convert_response(
|
230
|
+
response_obj, include_context_id=True
|
231
|
+
)
|
165
232
|
except Exception as e:
|
166
233
|
if isinstance(e, asyncio.TimeoutError):
|
167
234
|
raise RuntimeError("Timeout while waiting for audio chunk")
|
@@ -169,6 +236,11 @@ class _AsyncTTSContext:
|
|
169
236
|
finally:
|
170
237
|
self._close()
|
171
238
|
|
239
|
+
async def cancel(self):
|
240
|
+
"""Cancel the context. This will stop the generation of audio for this context."""
|
241
|
+
await self._websocket.websocket.send_json({"context_id": self._context_id, "cancel": True})
|
242
|
+
self._close()
|
243
|
+
|
172
244
|
def _close(self) -> None:
|
173
245
|
"""Closes the context. Automatically called when a done message is received for this context."""
|
174
246
|
self._websocket._remove_context(self._context_id)
|
@@ -192,7 +264,7 @@ class _AsyncTTSContext:
|
|
192
264
|
self._close()
|
193
265
|
|
194
266
|
|
195
|
-
class
|
267
|
+
class AsyncTtsWebsocket(TtsWebsocket):
|
196
268
|
"""This class contains methods to generate audio using WebSocket asynchronously."""
|
197
269
|
|
198
270
|
def __init__(
|
@@ -200,8 +272,8 @@ class _AsyncWebSocket(_WebSocket):
|
|
200
272
|
ws_url: str,
|
201
273
|
api_key: str,
|
202
274
|
cartesia_version: str,
|
203
|
-
timeout: float,
|
204
275
|
get_session: Callable[[], Optional[aiohttp.ClientSession]],
|
276
|
+
timeout: float = 30,
|
205
277
|
):
|
206
278
|
"""
|
207
279
|
Args:
|
@@ -216,7 +288,7 @@ class _AsyncWebSocket(_WebSocket):
|
|
216
288
|
self._get_session = get_session
|
217
289
|
self.websocket = None
|
218
290
|
self._context_queues: Dict[str, List[asyncio.Queue]] = {}
|
219
|
-
self._processing_task: asyncio.Task = None
|
291
|
+
self._processing_task: Optional[asyncio.Task] = None
|
220
292
|
|
221
293
|
def __del__(self):
|
222
294
|
try:
|
@@ -237,7 +309,26 @@ class _AsyncWebSocket(_WebSocket):
|
|
237
309
|
try:
|
238
310
|
self.websocket = await session.ws_connect(url)
|
239
311
|
except Exception as e:
|
240
|
-
|
312
|
+
# Extract status code if available
|
313
|
+
status_code = None
|
314
|
+
error_message = str(e)
|
315
|
+
|
316
|
+
if hasattr(e, 'status') and e.status is not None:
|
317
|
+
status_code = e.status
|
318
|
+
|
319
|
+
# Create a meaningful error message based on status code
|
320
|
+
if status_code == 402:
|
321
|
+
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
322
|
+
elif status_code == 401:
|
323
|
+
error_message = "Unauthorized. Please check your API key."
|
324
|
+
elif status_code == 403:
|
325
|
+
error_message = "Forbidden. You don't have permission to access this resource."
|
326
|
+
elif status_code == 404:
|
327
|
+
error_message = "Not found. The requested resource doesn't exist."
|
328
|
+
|
329
|
+
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
330
|
+
else:
|
331
|
+
raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
|
241
332
|
|
242
333
|
def _is_websocket_closed(self):
|
243
334
|
return self.websocket.closed
|
@@ -268,18 +359,19 @@ class _AsyncWebSocket(_WebSocket):
|
|
268
359
|
|
269
360
|
async def send(
|
270
361
|
self,
|
362
|
+
*,
|
271
363
|
model_id: str,
|
272
364
|
transcript: str,
|
273
|
-
output_format:
|
274
|
-
|
275
|
-
voice_embedding: Optional[List[float]] = None,
|
365
|
+
output_format: OutputFormatParams,
|
366
|
+
voice: TtsRequestVoiceSpecifierParams,
|
276
367
|
context_id: Optional[str] = None,
|
277
368
|
duration: Optional[int] = None,
|
278
369
|
language: Optional[str] = None,
|
279
370
|
stream: bool = True,
|
280
371
|
add_timestamps: bool = False,
|
281
|
-
|
282
|
-
|
372
|
+
add_phoneme_timestamps: bool = False,
|
373
|
+
use_original_timestamps: bool = False,
|
374
|
+
):
|
283
375
|
"""See :meth:`_WebSocket.send` for details."""
|
284
376
|
if context_id is None:
|
285
377
|
context_id = str(uuid.uuid4())
|
@@ -290,14 +382,14 @@ class _AsyncWebSocket(_WebSocket):
|
|
290
382
|
model_id=model_id,
|
291
383
|
transcript=transcript,
|
292
384
|
output_format=output_format,
|
293
|
-
|
294
|
-
voice_embedding=voice_embedding,
|
385
|
+
voice=voice,
|
295
386
|
context_id=context_id,
|
296
387
|
duration=duration,
|
297
388
|
language=language,
|
298
389
|
continue_=False,
|
299
390
|
add_timestamps=add_timestamps,
|
300
|
-
|
391
|
+
add_phoneme_timestamps=add_phoneme_timestamps,
|
392
|
+
use_original_timestamps=use_original_timestamps,
|
301
393
|
)
|
302
394
|
|
303
395
|
generator = ctx.receive()
|
@@ -305,18 +397,49 @@ class _AsyncWebSocket(_WebSocket):
|
|
305
397
|
if stream:
|
306
398
|
return generator
|
307
399
|
|
308
|
-
chunks = []
|
309
|
-
|
400
|
+
chunks: typing.List[str] = []
|
401
|
+
words: typing.List[str] = []
|
402
|
+
start: typing.List[float] = []
|
403
|
+
end: typing.List[float] = []
|
404
|
+
phonemes: typing.List[str] = []
|
405
|
+
phoneme_start: typing.List[float] = []
|
406
|
+
phoneme_end: typing.List[float] = []
|
310
407
|
async for chunk in generator:
|
311
|
-
if
|
312
|
-
chunks.append(chunk
|
313
|
-
if add_timestamps and
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
408
|
+
if chunk.audio is not None:
|
409
|
+
chunks.append(chunk.audio)
|
410
|
+
if add_timestamps and chunk.word_timestamps is not None:
|
411
|
+
if chunk.word_timestamps is not None:
|
412
|
+
words.extend(chunk.word_timestamps.words)
|
413
|
+
start.extend(chunk.word_timestamps.start)
|
414
|
+
end.extend(chunk.word_timestamps.end)
|
415
|
+
if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
|
416
|
+
if chunk.phoneme_timestamps is not None:
|
417
|
+
phonemes.extend(chunk.phoneme_timestamps.phonemes)
|
418
|
+
phoneme_start.extend(chunk.phoneme_timestamps.start)
|
419
|
+
phoneme_end.extend(chunk.phoneme_timestamps.end)
|
420
|
+
|
421
|
+
return WebSocketTtsOutput(
|
422
|
+
audio=b"".join(chunks), # type: ignore
|
423
|
+
context_id=context_id,
|
424
|
+
word_timestamps=(
|
425
|
+
WordTimestamps(
|
426
|
+
words=words,
|
427
|
+
start=start,
|
428
|
+
end=end,
|
429
|
+
)
|
430
|
+
if add_timestamps
|
431
|
+
else None
|
432
|
+
),
|
433
|
+
phoneme_timestamps=(
|
434
|
+
PhonemeTimestamps(
|
435
|
+
phonemes=phonemes,
|
436
|
+
start=phoneme_start,
|
437
|
+
end=phoneme_end,
|
438
|
+
)
|
439
|
+
if add_phoneme_timestamps
|
440
|
+
else None
|
441
|
+
),
|
442
|
+
)
|
320
443
|
|
321
444
|
async def _process_responses(self):
|
322
445
|
try:
|
@@ -332,12 +455,14 @@ class _AsyncWebSocket(_WebSocket):
|
|
332
455
|
raise e
|
333
456
|
|
334
457
|
async def _get_message(
|
335
|
-
self, context_id: str, timeout: float, flush_id:
|
458
|
+
self, context_id: str, timeout: float, flush_id: int = -1
|
336
459
|
) -> Dict[str, Any]:
|
337
460
|
if context_id not in self._context_queues:
|
338
461
|
raise ValueError(f"Context ID {context_id} not found.")
|
339
462
|
if len(self._context_queues[context_id]) <= flush_id:
|
340
|
-
raise ValueError(
|
463
|
+
raise ValueError(
|
464
|
+
f"Flush ID {flush_id} not found for context ID {context_id}."
|
465
|
+
)
|
341
466
|
return await asyncio.wait_for(
|
342
467
|
self._context_queues[context_id][flush_id].get(), timeout=timeout
|
343
468
|
)
|
@@ -350,9 +475,11 @@ class _AsyncWebSocket(_WebSocket):
|
|
350
475
|
if self._processing_task is None or self._processing_task.done():
|
351
476
|
self._processing_task = asyncio.create_task(self._process_responses())
|
352
477
|
|
353
|
-
def context(self, context_id: Optional[str] = None)
|
478
|
+
def context(self, context_id: Optional[str] = None):
|
354
479
|
if context_id in self._context_queues:
|
355
|
-
raise ValueError(
|
480
|
+
raise ValueError(
|
481
|
+
f"AsyncContext for context ID {context_id} already exists."
|
482
|
+
)
|
356
483
|
if context_id is None:
|
357
484
|
context_id = str(uuid.uuid4())
|
358
485
|
if context_id not in self._context_queues:
|