cartesia 1.3.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. cartesia/__init__.py +302 -3
  2. cartesia/api_status/__init__.py +6 -0
  3. cartesia/api_status/client.py +104 -0
  4. cartesia/api_status/requests/__init__.py +5 -0
  5. cartesia/api_status/requests/api_info.py +8 -0
  6. cartesia/api_status/types/__init__.py +5 -0
  7. cartesia/api_status/types/api_info.py +20 -0
  8. cartesia/base_client.py +156 -0
  9. cartesia/client.py +163 -40
  10. cartesia/core/__init__.py +50 -0
  11. cartesia/core/api_error.py +15 -0
  12. cartesia/core/client_wrapper.py +55 -0
  13. cartesia/core/datetime_utils.py +28 -0
  14. cartesia/core/file.py +67 -0
  15. cartesia/core/http_client.py +499 -0
  16. cartesia/core/jsonable_encoder.py +101 -0
  17. cartesia/core/pagination.py +88 -0
  18. cartesia/core/pydantic_utilities.py +296 -0
  19. cartesia/core/query_encoder.py +58 -0
  20. cartesia/core/remove_none_from_dict.py +11 -0
  21. cartesia/core/request_options.py +35 -0
  22. cartesia/core/serialization.py +272 -0
  23. cartesia/datasets/__init__.py +24 -0
  24. cartesia/datasets/requests/__init__.py +15 -0
  25. cartesia/datasets/requests/create_dataset_request.py +7 -0
  26. cartesia/datasets/requests/dataset.py +9 -0
  27. cartesia/datasets/requests/dataset_file.py +9 -0
  28. cartesia/datasets/requests/paginated_dataset_files.py +10 -0
  29. cartesia/datasets/requests/paginated_datasets.py +10 -0
  30. cartesia/datasets/types/__init__.py +17 -0
  31. cartesia/datasets/types/create_dataset_request.py +19 -0
  32. cartesia/datasets/types/dataset.py +21 -0
  33. cartesia/datasets/types/dataset_file.py +21 -0
  34. cartesia/datasets/types/file_purpose.py +5 -0
  35. cartesia/datasets/types/paginated_dataset_files.py +21 -0
  36. cartesia/datasets/types/paginated_datasets.py +21 -0
  37. cartesia/embedding/__init__.py +5 -0
  38. cartesia/embedding/types/__init__.py +5 -0
  39. cartesia/embedding/types/embedding.py +201 -0
  40. cartesia/environment.py +7 -0
  41. cartesia/infill/__init__.py +2 -0
  42. cartesia/infill/client.py +318 -0
  43. cartesia/tts/__init__.py +167 -0
  44. cartesia/{_async_websocket.py → tts/_async_websocket.py} +212 -85
  45. cartesia/tts/_websocket.py +479 -0
  46. cartesia/tts/client.py +407 -0
  47. cartesia/tts/requests/__init__.py +76 -0
  48. cartesia/tts/requests/cancel_context_request.py +17 -0
  49. cartesia/tts/requests/controls.py +11 -0
  50. cartesia/tts/requests/generation_request.py +58 -0
  51. cartesia/tts/requests/mp_3_output_format.py +11 -0
  52. cartesia/tts/requests/output_format.py +30 -0
  53. cartesia/tts/requests/phoneme_timestamps.py +10 -0
  54. cartesia/tts/requests/raw_output_format.py +11 -0
  55. cartesia/tts/requests/speed.py +7 -0
  56. cartesia/tts/requests/tts_request.py +24 -0
  57. cartesia/tts/requests/tts_request_embedding_specifier.py +16 -0
  58. cartesia/tts/requests/tts_request_id_specifier.py +16 -0
  59. cartesia/tts/requests/tts_request_voice_specifier.py +7 -0
  60. cartesia/tts/requests/wav_output_format.py +7 -0
  61. cartesia/tts/requests/web_socket_base_response.py +11 -0
  62. cartesia/tts/requests/web_socket_chunk_response.py +11 -0
  63. cartesia/tts/requests/web_socket_done_response.py +7 -0
  64. cartesia/tts/requests/web_socket_error_response.py +7 -0
  65. cartesia/tts/requests/web_socket_flush_done_response.py +9 -0
  66. cartesia/tts/requests/web_socket_phoneme_timestamps_response.py +9 -0
  67. cartesia/tts/requests/web_socket_raw_output_format.py +11 -0
  68. cartesia/tts/requests/web_socket_request.py +7 -0
  69. cartesia/tts/requests/web_socket_response.py +70 -0
  70. cartesia/tts/requests/web_socket_stream_options.py +8 -0
  71. cartesia/tts/requests/web_socket_timestamps_response.py +9 -0
  72. cartesia/tts/requests/web_socket_tts_output.py +18 -0
  73. cartesia/tts/requests/web_socket_tts_request.py +25 -0
  74. cartesia/tts/requests/word_timestamps.py +10 -0
  75. cartesia/tts/socket_client.py +302 -0
  76. cartesia/tts/types/__init__.py +90 -0
  77. cartesia/tts/types/cancel_context_request.py +28 -0
  78. cartesia/tts/types/context_id.py +3 -0
  79. cartesia/tts/types/controls.py +22 -0
  80. cartesia/tts/types/emotion.py +34 -0
  81. cartesia/tts/types/flush_id.py +3 -0
  82. cartesia/tts/types/generation_request.py +71 -0
  83. cartesia/tts/types/mp_3_output_format.py +23 -0
  84. cartesia/tts/types/natural_specifier.py +5 -0
  85. cartesia/tts/types/numerical_specifier.py +3 -0
  86. cartesia/tts/types/output_format.py +58 -0
  87. cartesia/tts/types/phoneme_timestamps.py +21 -0
  88. cartesia/tts/types/raw_encoding.py +5 -0
  89. cartesia/tts/types/raw_output_format.py +22 -0
  90. cartesia/tts/types/speed.py +7 -0
  91. cartesia/tts/types/supported_language.py +7 -0
  92. cartesia/tts/types/tts_request.py +35 -0
  93. cartesia/tts/types/tts_request_embedding_specifier.py +27 -0
  94. cartesia/tts/types/tts_request_id_specifier.py +27 -0
  95. cartesia/tts/types/tts_request_voice_specifier.py +7 -0
  96. cartesia/tts/types/wav_output_format.py +17 -0
  97. cartesia/tts/types/web_socket_base_response.py +22 -0
  98. cartesia/tts/types/web_socket_chunk_response.py +22 -0
  99. cartesia/tts/types/web_socket_done_response.py +17 -0
  100. cartesia/tts/types/web_socket_error_response.py +19 -0
  101. cartesia/tts/types/web_socket_flush_done_response.py +21 -0
  102. cartesia/tts/types/web_socket_phoneme_timestamps_response.py +20 -0
  103. cartesia/tts/types/web_socket_raw_output_format.py +22 -0
  104. cartesia/tts/types/web_socket_request.py +7 -0
  105. cartesia/tts/types/web_socket_response.py +125 -0
  106. cartesia/tts/types/web_socket_stream_options.py +19 -0
  107. cartesia/tts/types/web_socket_timestamps_response.py +20 -0
  108. cartesia/tts/types/web_socket_tts_output.py +29 -0
  109. cartesia/tts/types/web_socket_tts_request.py +37 -0
  110. cartesia/tts/types/word_timestamps.py +21 -0
  111. cartesia/{_constants.py → tts/utils/constants.py} +2 -2
  112. cartesia/tts/utils/tts.py +64 -0
  113. cartesia/tts/utils/types.py +70 -0
  114. cartesia/version.py +3 -1
  115. cartesia/voice_changer/__init__.py +27 -0
  116. cartesia/voice_changer/client.py +395 -0
  117. cartesia/voice_changer/requests/__init__.py +15 -0
  118. cartesia/voice_changer/requests/streaming_response.py +38 -0
  119. cartesia/voice_changer/types/__init__.py +17 -0
  120. cartesia/voice_changer/types/output_format_container.py +5 -0
  121. cartesia/voice_changer/types/streaming_response.py +64 -0
  122. cartesia/voices/__init__.py +81 -0
  123. cartesia/voices/client.py +1218 -0
  124. cartesia/voices/requests/__init__.py +29 -0
  125. cartesia/voices/requests/create_voice_request.py +23 -0
  126. cartesia/voices/requests/embedding_response.py +8 -0
  127. cartesia/voices/requests/embedding_specifier.py +10 -0
  128. cartesia/voices/requests/get_voices_response.py +24 -0
  129. cartesia/voices/requests/id_specifier.py +10 -0
  130. cartesia/voices/requests/localize_dialect.py +11 -0
  131. cartesia/voices/requests/localize_voice_request.py +28 -0
  132. cartesia/voices/requests/mix_voice_specifier.py +7 -0
  133. cartesia/voices/requests/mix_voices_request.py +9 -0
  134. cartesia/voices/requests/update_voice_request.py +15 -0
  135. cartesia/voices/requests/voice.py +43 -0
  136. cartesia/voices/requests/voice_metadata.py +36 -0
  137. cartesia/voices/types/__init__.py +53 -0
  138. cartesia/voices/types/base_voice_id.py +5 -0
  139. cartesia/voices/types/clone_mode.py +5 -0
  140. cartesia/voices/types/create_voice_request.py +34 -0
  141. cartesia/voices/types/embedding_response.py +20 -0
  142. cartesia/voices/types/embedding_specifier.py +22 -0
  143. cartesia/voices/types/gender.py +5 -0
  144. cartesia/voices/types/gender_presentation.py +5 -0
  145. cartesia/voices/types/get_voices_response.py +34 -0
  146. cartesia/voices/types/id_specifier.py +22 -0
  147. cartesia/voices/types/localize_dialect.py +11 -0
  148. cartesia/voices/types/localize_english_dialect.py +5 -0
  149. cartesia/voices/types/localize_french_dialect.py +5 -0
  150. cartesia/voices/types/localize_portuguese_dialect.py +5 -0
  151. cartesia/voices/types/localize_spanish_dialect.py +5 -0
  152. cartesia/voices/types/localize_target_language.py +7 -0
  153. cartesia/voices/types/localize_voice_request.py +39 -0
  154. cartesia/voices/types/mix_voice_specifier.py +7 -0
  155. cartesia/voices/types/mix_voices_request.py +20 -0
  156. cartesia/voices/types/update_voice_request.py +27 -0
  157. cartesia/voices/types/voice.py +54 -0
  158. cartesia/voices/types/voice_expand_options.py +5 -0
  159. cartesia/voices/types/voice_id.py +3 -0
  160. cartesia/voices/types/voice_metadata.py +48 -0
  161. cartesia/voices/types/weight.py +3 -0
  162. cartesia-2.0.0.dist-info/METADATA +414 -0
  163. cartesia-2.0.0.dist-info/RECORD +165 -0
  164. {cartesia-1.3.1.dist-info → cartesia-2.0.0.dist-info}/WHEEL +1 -1
  165. cartesia/_async_sse.py +0 -95
  166. cartesia/_logger.py +0 -3
  167. cartesia/_sse.py +0 -143
  168. cartesia/_types.py +0 -70
  169. cartesia/_websocket.py +0 -358
  170. cartesia/async_client.py +0 -82
  171. cartesia/async_tts.py +0 -63
  172. cartesia/resource.py +0 -44
  173. cartesia/tts.py +0 -137
  174. cartesia/utils/deprecated.py +0 -55
  175. cartesia/utils/retry.py +0 -87
  176. cartesia/utils/tts.py +0 -78
  177. cartesia/voices.py +0 -208
  178. cartesia-1.3.1.dist-info/METADATA +0 -661
  179. cartesia-1.3.1.dist-info/RECORD +0 -23
  180. cartesia-1.3.1.dist-info/licenses/LICENSE.md +0 -21
  181. /cartesia/{utils/__init__.py → py.typed} +0 -0
@@ -1,4 +1,6 @@
1
1
  import asyncio
2
+ import json
3
+ import typing
2
4
  import uuid
3
5
  from collections import defaultdict
4
6
  from types import TracebackType
@@ -6,11 +8,27 @@ from typing import Any, AsyncGenerator, Callable, Dict, List, Optional, Union
6
8
 
7
9
  import aiohttp
8
10
 
9
- from cartesia._constants import DEFAULT_MODEL_ID, DEFAULT_OUTPUT_FORMAT, DEFAULT_VOICE_EMBEDDING
10
- from cartesia._types import OutputFormat, VoiceControls
11
- from cartesia._websocket import _WebSocket
12
- from cartesia.tts import TTS
13
- from cartesia.utils.tts import _construct_tts_request
11
+ from cartesia.tts.requests import TtsRequestVoiceSpecifierParams
12
+ from cartesia.tts.requests.output_format import OutputFormatParams
13
+ from cartesia.tts.types import (
14
+ WebSocketResponse,
15
+ WebSocketResponse_Done,
16
+ WebSocketResponse_Error,
17
+ WebSocketResponse_FlushDone,
18
+ WebSocketTtsOutput,
19
+ WordTimestamps,
20
+ PhonemeTimestamps,
21
+ )
22
+
23
+ from ..core.pydantic_utilities import parse_obj_as
24
+ from ._websocket import TtsWebsocket
25
+ from .types.generation_request import GenerationRequest
26
+ from .utils.constants import (
27
+ DEFAULT_MODEL_ID,
28
+ DEFAULT_OUTPUT_FORMAT,
29
+ DEFAULT_VOICE_EMBEDDING,
30
+ )
31
+ from .utils.tts import get_output_format
14
32
 
15
33
 
16
34
  class _AsyncTTSContext:
@@ -26,7 +44,9 @@ class _AsyncTTSContext:
26
44
 
27
45
  """
28
46
 
29
- def __init__(self, context_id: str, websocket: "_AsyncWebSocket", timeout: float):
47
+ def __init__(
48
+ self, context_id: str, websocket: "AsyncTtsWebsocket", timeout: float = 30
49
+ ):
30
50
  self._context_id = context_id
31
51
  self._websocket = websocket
32
52
  self.timeout = timeout
@@ -38,60 +58,79 @@ class _AsyncTTSContext:
38
58
 
39
59
  async def send(
40
60
  self,
61
+ *,
41
62
  model_id: str,
42
63
  transcript: str,
43
- output_format: OutputFormat,
44
- voice_id: Optional[str] = None,
45
- voice_embedding: Optional[List[float]] = None,
64
+ output_format: OutputFormatParams,
65
+ voice: TtsRequestVoiceSpecifierParams,
46
66
  context_id: Optional[str] = None,
47
- continue_: bool = False,
48
- flush: bool = False,
49
67
  duration: Optional[int] = None,
50
68
  language: Optional[str] = None,
69
+ stream: bool = True,
51
70
  add_timestamps: bool = False,
52
- _experimental_voice_controls: Optional[VoiceControls] = None,
71
+ add_phoneme_timestamps: bool = False,
72
+ use_original_timestamps: bool = False,
73
+ continue_: bool = False,
74
+ flush: bool = False,
53
75
  ) -> None:
54
76
  """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
55
77
 
56
78
  Args:
57
- model_id: The ID of the model to use for generating audio.
58
- transcript: The text to convert to speech.
59
- output_format: A dictionary containing the details of the output format.
60
- voice_id: The ID of the voice to use for generating audio.
61
- voice_embedding: The embedding of the voice to use for generating audio.
62
- context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
63
- continue_: Whether to continue the audio generation from the previous transcript or not.
64
- flush: Whether to trigger a manual flush for the current context's generation.
65
- duration: The duration of the audio in seconds.
66
- language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
67
- add_timestamps: Whether to return word-level timestamps.
68
- _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
69
- Note: This is an experimental feature and may change rapidly in future releases.
79
+ request: The request to generate audio.
70
80
 
71
81
  Returns:
72
82
  None.
73
83
  """
74
- if context_id is not None and context_id != self._context_id:
75
- raise ValueError("Context ID does not match the context ID of the current context.")
76
- if continue_ and transcript == "" and not flush:
77
- raise ValueError("Transcript cannot be empty when continue_ is True.")
78
-
79
84
  await self._websocket.connect()
80
-
81
- request_body = _construct_tts_request(
82
- model_id=model_id,
83
- transcript=transcript,
84
- output_format=output_format,
85
- voice_id=voice_id,
86
- voice_embedding=voice_embedding,
87
- duration=duration,
88
- language=language,
89
- context_id=self._context_id,
90
- add_timestamps=add_timestamps,
91
- continue_=continue_,
92
- flush=flush,
93
- _experimental_voice_controls=_experimental_voice_controls,
94
- )
85
+ assert self._websocket.websocket is not None, "WebSocket is not connected"
86
+
87
+ request_body = {
88
+ "model_id": model_id,
89
+ "transcript": transcript,
90
+ "output_format": (
91
+ output_format
92
+ if isinstance(output_format, dict)
93
+ else output_format.dict()
94
+ ),
95
+ "voice": (voice if isinstance(voice, dict) else voice.dict()),
96
+ "context_id": self._context_id,
97
+ }
98
+ if context_id is not None:
99
+ request_body["context_id"] = context_id
100
+ if duration is not None:
101
+ request_body["duration"] = duration
102
+ if language is not None:
103
+ request_body["language"] = language
104
+ if stream:
105
+ request_body["stream"] = stream
106
+ if add_timestamps:
107
+ request_body["add_timestamps"] = add_timestamps
108
+ if add_phoneme_timestamps:
109
+ request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
110
+ if use_original_timestamps:
111
+ request_body["use_original_timestamps"] = use_original_timestamps
112
+ if continue_:
113
+ request_body["continue"] = continue_
114
+ if flush:
115
+ request_body["flush"] = flush
116
+
117
+ if (
118
+ "context_id" in request_body
119
+ and request_body["context_id"] is not None
120
+ and request_body["context_id"] != self._context_id
121
+ ):
122
+ raise ValueError(
123
+ "Context ID does not match the context ID of the current context."
124
+ )
125
+ request_body["context_id"] = self._context_id
126
+
127
+ if (
128
+ "continue" in request_body
129
+ and request_body["continue"]
130
+ and request_body["transcript"] == ""
131
+ and ("flush" in request_body and not request_body["flush"])
132
+ ):
133
+ raise ValueError("Transcript cannot be empty when continue_ is True.")
95
134
 
96
135
  await self._websocket.websocket.send_json(request_body)
97
136
 
@@ -103,8 +142,11 @@ class _AsyncTTSContext:
103
142
  await self.send(
104
143
  model_id=DEFAULT_MODEL_ID,
105
144
  transcript="",
106
- output_format=TTS.get_output_format(DEFAULT_OUTPUT_FORMAT),
107
- voice_embedding=DEFAULT_VOICE_EMBEDDING, # Default voice embedding since it's a required input for now.
145
+ output_format=get_output_format(DEFAULT_OUTPUT_FORMAT),
146
+ voice={
147
+ "mode": "embedding",
148
+ "embedding": DEFAULT_VOICE_EMBEDDING,
149
+ },
108
150
  context_id=self._context_id,
109
151
  continue_=False,
110
152
  )
@@ -114,8 +156,11 @@ class _AsyncTTSContext:
114
156
  await self.send(
115
157
  model_id=DEFAULT_MODEL_ID,
116
158
  transcript="",
117
- output_format=TTS.get_output_format(DEFAULT_OUTPUT_FORMAT),
118
- voice_embedding=DEFAULT_VOICE_EMBEDDING, # Default voice embedding since it's a required input for now.
159
+ output_format=get_output_format(DEFAULT_OUTPUT_FORMAT),
160
+ voice={
161
+ "mode": "embedding",
162
+ "embedding": DEFAULT_VOICE_EMBEDDING,
163
+ },
119
164
  context_id=self._context_id,
120
165
  continue_=True,
121
166
  flush=True,
@@ -134,11 +179,23 @@ class _AsyncTTSContext:
134
179
  response = await self._websocket._get_message(
135
180
  self._context_id, timeout=self.timeout, flush_id=flush_id
136
181
  )
137
- if "error" in response:
138
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
139
- if response.get("flush_done") or response["done"]:
182
+ response_obj = typing.cast(
183
+ WebSocketResponse,
184
+ parse_obj_as(
185
+ type_=WebSocketResponse, object_=response # type: ignore
186
+ ),
187
+ )
188
+ if isinstance(response_obj, WebSocketResponse_Error):
189
+ raise RuntimeError(
190
+ f"Error generating audio:\n{response_obj.error}"
191
+ )
192
+ if isinstance(response_obj, WebSocketResponse_Done) or isinstance(
193
+ response_obj, WebSocketResponse_FlushDone
194
+ ):
140
195
  break
141
- yield self._websocket._convert_response(response, include_context_id=True)
196
+ yield self._websocket._convert_response(
197
+ response_obj, include_context_id=True
198
+ )
142
199
  except Exception as e:
143
200
  if isinstance(e, asyncio.TimeoutError):
144
201
  raise RuntimeError("Timeout while waiting for audio chunk")
@@ -146,7 +203,7 @@ class _AsyncTTSContext:
146
203
 
147
204
  return generator
148
205
 
149
- async def receive(self) -> AsyncGenerator[Dict[str, Any], None]:
206
+ async def receive(self) -> AsyncGenerator[WebSocketTtsOutput, None]:
150
207
  """Receive the audio chunks from the WebSocket. This method is a generator that yields audio chunks.
151
208
 
152
209
  Returns:
@@ -157,11 +214,21 @@ class _AsyncTTSContext:
157
214
  response = await self._websocket._get_message(
158
215
  self._context_id, timeout=self.timeout
159
216
  )
160
- if "error" in response:
161
- raise RuntimeError(f"Error generating audio:\n{response['error']}")
162
- if response["done"]:
217
+ response_obj = typing.cast(
218
+ WebSocketResponse,
219
+ parse_obj_as(
220
+ type_=WebSocketResponse, # type: ignore
221
+ object_=response,
222
+ ),
223
+ )
224
+
225
+ if isinstance(response_obj, WebSocketResponse_Error):
226
+ raise RuntimeError(f"Error generating audio:\n{response_obj.error}")
227
+ if isinstance(response_obj, WebSocketResponse_Done):
163
228
  break
164
- yield self._websocket._convert_response(response, include_context_id=True)
229
+ yield self._websocket._convert_response(
230
+ response_obj, include_context_id=True
231
+ )
165
232
  except Exception as e:
166
233
  if isinstance(e, asyncio.TimeoutError):
167
234
  raise RuntimeError("Timeout while waiting for audio chunk")
@@ -169,6 +236,11 @@ class _AsyncTTSContext:
169
236
  finally:
170
237
  self._close()
171
238
 
239
+ async def cancel(self):
240
+ """Cancel the context. This will stop the generation of audio for this context."""
241
+ await self._websocket.websocket.send_json({"context_id": self._context_id, "cancel": True})
242
+ self._close()
243
+
172
244
  def _close(self) -> None:
173
245
  """Closes the context. Automatically called when a done message is received for this context."""
174
246
  self._websocket._remove_context(self._context_id)
@@ -192,7 +264,7 @@ class _AsyncTTSContext:
192
264
  self._close()
193
265
 
194
266
 
195
- class _AsyncWebSocket(_WebSocket):
267
+ class AsyncTtsWebsocket(TtsWebsocket):
196
268
  """This class contains methods to generate audio using WebSocket asynchronously."""
197
269
 
198
270
  def __init__(
@@ -200,8 +272,8 @@ class _AsyncWebSocket(_WebSocket):
200
272
  ws_url: str,
201
273
  api_key: str,
202
274
  cartesia_version: str,
203
- timeout: float,
204
275
  get_session: Callable[[], Optional[aiohttp.ClientSession]],
276
+ timeout: float = 30,
205
277
  ):
206
278
  """
207
279
  Args:
@@ -216,7 +288,7 @@ class _AsyncWebSocket(_WebSocket):
216
288
  self._get_session = get_session
217
289
  self.websocket = None
218
290
  self._context_queues: Dict[str, List[asyncio.Queue]] = {}
219
- self._processing_task: asyncio.Task = None
291
+ self._processing_task: Optional[asyncio.Task] = None
220
292
 
221
293
  def __del__(self):
222
294
  try:
@@ -237,7 +309,26 @@ class _AsyncWebSocket(_WebSocket):
237
309
  try:
238
310
  self.websocket = await session.ws_connect(url)
239
311
  except Exception as e:
240
- raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
312
+ # Extract status code if available
313
+ status_code = None
314
+ error_message = str(e)
315
+
316
+ if hasattr(e, 'status') and e.status is not None:
317
+ status_code = e.status
318
+
319
+ # Create a meaningful error message based on status code
320
+ if status_code == 402:
321
+ error_message = "Payment required. Your API key may have insufficient credits or permissions."
322
+ elif status_code == 401:
323
+ error_message = "Unauthorized. Please check your API key."
324
+ elif status_code == 403:
325
+ error_message = "Forbidden. You don't have permission to access this resource."
326
+ elif status_code == 404:
327
+ error_message = "Not found. The requested resource doesn't exist."
328
+
329
+ raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
330
+ else:
331
+ raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
241
332
 
242
333
  def _is_websocket_closed(self):
243
334
  return self.websocket.closed
@@ -268,18 +359,19 @@ class _AsyncWebSocket(_WebSocket):
268
359
 
269
360
  async def send(
270
361
  self,
362
+ *,
271
363
  model_id: str,
272
364
  transcript: str,
273
- output_format: OutputFormat,
274
- voice_id: Optional[str] = None,
275
- voice_embedding: Optional[List[float]] = None,
365
+ output_format: OutputFormatParams,
366
+ voice: TtsRequestVoiceSpecifierParams,
276
367
  context_id: Optional[str] = None,
277
368
  duration: Optional[int] = None,
278
369
  language: Optional[str] = None,
279
370
  stream: bool = True,
280
371
  add_timestamps: bool = False,
281
- _experimental_voice_controls: Optional[VoiceControls] = None,
282
- ) -> Union[bytes, AsyncGenerator[bytes, None]]:
372
+ add_phoneme_timestamps: bool = False,
373
+ use_original_timestamps: bool = False,
374
+ ):
283
375
  """See :meth:`_WebSocket.send` for details."""
284
376
  if context_id is None:
285
377
  context_id = str(uuid.uuid4())
@@ -290,14 +382,14 @@ class _AsyncWebSocket(_WebSocket):
290
382
  model_id=model_id,
291
383
  transcript=transcript,
292
384
  output_format=output_format,
293
- voice_id=voice_id,
294
- voice_embedding=voice_embedding,
385
+ voice=voice,
295
386
  context_id=context_id,
296
387
  duration=duration,
297
388
  language=language,
298
389
  continue_=False,
299
390
  add_timestamps=add_timestamps,
300
- _experimental_voice_controls=_experimental_voice_controls,
391
+ add_phoneme_timestamps=add_phoneme_timestamps,
392
+ use_original_timestamps=use_original_timestamps,
301
393
  )
302
394
 
303
395
  generator = ctx.receive()
@@ -305,18 +397,49 @@ class _AsyncWebSocket(_WebSocket):
305
397
  if stream:
306
398
  return generator
307
399
 
308
- chunks = []
309
- word_timestamps = defaultdict(list)
400
+ chunks: typing.List[str] = []
401
+ words: typing.List[str] = []
402
+ start: typing.List[float] = []
403
+ end: typing.List[float] = []
404
+ phonemes: typing.List[str] = []
405
+ phoneme_start: typing.List[float] = []
406
+ phoneme_end: typing.List[float] = []
310
407
  async for chunk in generator:
311
- if "audio" in chunk:
312
- chunks.append(chunk["audio"])
313
- if add_timestamps and "word_timestamps" in chunk:
314
- for k, v in chunk["word_timestamps"].items():
315
- word_timestamps[k].extend(v)
316
- out = {"audio": b"".join(chunks), "context_id": context_id}
317
- if add_timestamps:
318
- out["word_timestamps"] = word_timestamps
319
- return out
408
+ if chunk.audio is not None:
409
+ chunks.append(chunk.audio)
410
+ if add_timestamps and chunk.word_timestamps is not None:
411
+ if chunk.word_timestamps is not None:
412
+ words.extend(chunk.word_timestamps.words)
413
+ start.extend(chunk.word_timestamps.start)
414
+ end.extend(chunk.word_timestamps.end)
415
+ if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
416
+ if chunk.phoneme_timestamps is not None:
417
+ phonemes.extend(chunk.phoneme_timestamps.phonemes)
418
+ phoneme_start.extend(chunk.phoneme_timestamps.start)
419
+ phoneme_end.extend(chunk.phoneme_timestamps.end)
420
+
421
+ return WebSocketTtsOutput(
422
+ audio=b"".join(chunks), # type: ignore
423
+ context_id=context_id,
424
+ word_timestamps=(
425
+ WordTimestamps(
426
+ words=words,
427
+ start=start,
428
+ end=end,
429
+ )
430
+ if add_timestamps
431
+ else None
432
+ ),
433
+ phoneme_timestamps=(
434
+ PhonemeTimestamps(
435
+ phonemes=phonemes,
436
+ start=phoneme_start,
437
+ end=phoneme_end,
438
+ )
439
+ if add_phoneme_timestamps
440
+ else None
441
+ ),
442
+ )
320
443
 
321
444
  async def _process_responses(self):
322
445
  try:
@@ -332,12 +455,14 @@ class _AsyncWebSocket(_WebSocket):
332
455
  raise e
333
456
 
334
457
  async def _get_message(
335
- self, context_id: str, timeout: float, flush_id: Optional[int] = -1
458
+ self, context_id: str, timeout: float, flush_id: int = -1
336
459
  ) -> Dict[str, Any]:
337
460
  if context_id not in self._context_queues:
338
461
  raise ValueError(f"Context ID {context_id} not found.")
339
462
  if len(self._context_queues[context_id]) <= flush_id:
340
- raise ValueError(f"Flush ID {flush_id} not found for context ID {context_id}.")
463
+ raise ValueError(
464
+ f"Flush ID {flush_id} not found for context ID {context_id}."
465
+ )
341
466
  return await asyncio.wait_for(
342
467
  self._context_queues[context_id][flush_id].get(), timeout=timeout
343
468
  )
@@ -350,9 +475,11 @@ class _AsyncWebSocket(_WebSocket):
350
475
  if self._processing_task is None or self._processing_task.done():
351
476
  self._processing_task = asyncio.create_task(self._process_responses())
352
477
 
353
- def context(self, context_id: Optional[str] = None) -> _AsyncTTSContext:
478
+ def context(self, context_id: Optional[str] = None):
354
479
  if context_id in self._context_queues:
355
- raise ValueError(f"AsyncContext for context ID {context_id} already exists.")
480
+ raise ValueError(
481
+ f"AsyncContext for context ID {context_id} already exists."
482
+ )
356
483
  if context_id is None:
357
484
  context_id = str(uuid.uuid4())
358
485
  if context_id not in self._context_queues: