cartesia 2.0.0b2__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +10 -0
- cartesia/base_client.py +0 -4
- cartesia/core/__init__.py +3 -0
- cartesia/core/client_wrapper.py +2 -2
- cartesia/core/pagination.py +88 -0
- cartesia/infill/client.py +4 -4
- cartesia/tts/_async_websocket.py +53 -1
- cartesia/tts/_websocket.py +52 -3
- cartesia/tts/client.py +4 -4
- cartesia/tts/requests/generation_request.py +5 -0
- cartesia/tts/requests/web_socket_chunk_response.py +3 -0
- cartesia/tts/requests/web_socket_response.py +2 -1
- cartesia/tts/requests/web_socket_tts_request.py +1 -0
- cartesia/tts/types/emotion.py +5 -0
- cartesia/tts/types/generation_request.py +5 -0
- cartesia/tts/types/web_socket_chunk_response.py +3 -1
- cartesia/tts/types/web_socket_response.py +2 -1
- cartesia/tts/types/web_socket_tts_output.py +2 -0
- cartesia/tts/types/web_socket_tts_request.py +1 -0
- cartesia/tts/utils/constants.py +2 -2
- cartesia/voice_changer/requests/streaming_response.py +2 -0
- cartesia/voice_changer/types/streaming_response.py +2 -0
- cartesia/voices/__init__.py +10 -0
- cartesia/voices/client.py +209 -44
- cartesia/voices/requests/__init__.py +2 -0
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/localize_dialect.py +4 -1
- cartesia/voices/requests/localize_voice_request.py +15 -2
- cartesia/voices/requests/voice.py +13 -9
- cartesia/voices/types/__init__.py +8 -0
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/localize_dialect.py +4 -1
- cartesia/voices/types/localize_french_dialect.py +5 -0
- cartesia/voices/types/localize_voice_request.py +16 -3
- cartesia/voices/types/voice.py +13 -9
- cartesia/voices/types/voice_expand_options.py +5 -0
- {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/METADATA +149 -73
- {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/RECORD +40 -35
- cartesia/datasets/client.py +0 -392
- {cartesia-2.0.0b2.dist-info → cartesia-2.0.0b8.dist-info}/WHEEL +0 -0
cartesia/__init__.py
CHANGED
@@ -121,11 +121,15 @@ from .voices import (
|
|
121
121
|
EmbeddingSpecifier,
|
122
122
|
EmbeddingSpecifierParams,
|
123
123
|
Gender,
|
124
|
+
GenderPresentation,
|
125
|
+
GetVoicesResponse,
|
126
|
+
GetVoicesResponseParams,
|
124
127
|
IdSpecifier,
|
125
128
|
IdSpecifierParams,
|
126
129
|
LocalizeDialect,
|
127
130
|
LocalizeDialectParams,
|
128
131
|
LocalizeEnglishDialect,
|
132
|
+
LocalizeFrenchDialect,
|
129
133
|
LocalizePortugueseDialect,
|
130
134
|
LocalizeSpanishDialect,
|
131
135
|
LocalizeTargetLanguage,
|
@@ -138,6 +142,7 @@ from .voices import (
|
|
138
142
|
UpdateVoiceRequest,
|
139
143
|
UpdateVoiceRequestParams,
|
140
144
|
Voice,
|
145
|
+
VoiceExpandOptions,
|
141
146
|
VoiceId,
|
142
147
|
VoiceMetadata,
|
143
148
|
VoiceMetadataParams,
|
@@ -175,13 +180,17 @@ __all__ = [
|
|
175
180
|
"FilePurpose",
|
176
181
|
"FlushId",
|
177
182
|
"Gender",
|
183
|
+
"GenderPresentation",
|
178
184
|
"GenerationRequest",
|
179
185
|
"GenerationRequestParams",
|
186
|
+
"GetVoicesResponse",
|
187
|
+
"GetVoicesResponseParams",
|
180
188
|
"IdSpecifier",
|
181
189
|
"IdSpecifierParams",
|
182
190
|
"LocalizeDialect",
|
183
191
|
"LocalizeDialectParams",
|
184
192
|
"LocalizeEnglishDialect",
|
193
|
+
"LocalizeFrenchDialect",
|
185
194
|
"LocalizePortugueseDialect",
|
186
195
|
"LocalizeSpanishDialect",
|
187
196
|
"LocalizeTargetLanguage",
|
@@ -235,6 +244,7 @@ __all__ = [
|
|
235
244
|
"UpdateVoiceRequest",
|
236
245
|
"UpdateVoiceRequestParams",
|
237
246
|
"Voice",
|
247
|
+
"VoiceExpandOptions",
|
238
248
|
"VoiceId",
|
239
249
|
"VoiceMetadata",
|
240
250
|
"VoiceMetadataParams",
|
cartesia/base_client.py
CHANGED
@@ -5,14 +5,12 @@ from .environment import CartesiaEnvironment
|
|
5
5
|
import httpx
|
6
6
|
from .core.client_wrapper import SyncClientWrapper
|
7
7
|
from .api_status.client import ApiStatusClient
|
8
|
-
from .datasets.client import DatasetsClient
|
9
8
|
from .infill.client import InfillClient
|
10
9
|
from .tts.client import TtsClient
|
11
10
|
from .voice_changer.client import VoiceChangerClient
|
12
11
|
from .voices.client import VoicesClient
|
13
12
|
from .core.client_wrapper import AsyncClientWrapper
|
14
13
|
from .api_status.client import AsyncApiStatusClient
|
15
|
-
from .datasets.client import AsyncDatasetsClient
|
16
14
|
from .infill.client import AsyncInfillClient
|
17
15
|
from .tts.client import AsyncTtsClient
|
18
16
|
from .voice_changer.client import AsyncVoiceChangerClient
|
@@ -78,7 +76,6 @@ class BaseCartesia:
|
|
78
76
|
timeout=_defaulted_timeout,
|
79
77
|
)
|
80
78
|
self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
|
81
|
-
self.datasets = DatasetsClient(client_wrapper=self._client_wrapper)
|
82
79
|
self.infill = InfillClient(client_wrapper=self._client_wrapper)
|
83
80
|
self.tts = TtsClient(client_wrapper=self._client_wrapper)
|
84
81
|
self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
|
@@ -144,7 +141,6 @@ class AsyncBaseCartesia:
|
|
144
141
|
timeout=_defaulted_timeout,
|
145
142
|
)
|
146
143
|
self.api_status = AsyncApiStatusClient(client_wrapper=self._client_wrapper)
|
147
|
-
self.datasets = AsyncDatasetsClient(client_wrapper=self._client_wrapper)
|
148
144
|
self.infill = AsyncInfillClient(client_wrapper=self._client_wrapper)
|
149
145
|
self.tts = AsyncTtsClient(client_wrapper=self._client_wrapper)
|
150
146
|
self.voice_changer = AsyncVoiceChangerClient(client_wrapper=self._client_wrapper)
|
cartesia/core/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from .datetime_utils import serialize_datetime
|
|
6
6
|
from .file import File, convert_file_dict_to_httpx_tuples, with_content_type
|
7
7
|
from .http_client import AsyncHttpClient, HttpClient
|
8
8
|
from .jsonable_encoder import jsonable_encoder
|
9
|
+
from .pagination import AsyncPager, SyncPager
|
9
10
|
from .pydantic_utilities import (
|
10
11
|
IS_PYDANTIC_V2,
|
11
12
|
UniversalBaseModel,
|
@@ -24,6 +25,7 @@ __all__ = [
|
|
24
25
|
"ApiError",
|
25
26
|
"AsyncClientWrapper",
|
26
27
|
"AsyncHttpClient",
|
28
|
+
"AsyncPager",
|
27
29
|
"BaseClientWrapper",
|
28
30
|
"FieldMetadata",
|
29
31
|
"File",
|
@@ -31,6 +33,7 @@ __all__ = [
|
|
31
33
|
"IS_PYDANTIC_V2",
|
32
34
|
"RequestOptions",
|
33
35
|
"SyncClientWrapper",
|
36
|
+
"SyncPager",
|
34
37
|
"UniversalBaseModel",
|
35
38
|
"UniversalRootModel",
|
36
39
|
"convert_and_respect_annotation_metadata",
|
cartesia/core/client_wrapper.py
CHANGED
@@ -16,10 +16,10 @@ class BaseClientWrapper:
|
|
16
16
|
headers: typing.Dict[str, str] = {
|
17
17
|
"X-Fern-Language": "Python",
|
18
18
|
"X-Fern-SDK-Name": "cartesia",
|
19
|
-
"X-Fern-SDK-Version": "2.0.0b2",
|
19
|
+
"X-Fern-SDK-Version": "2.0.0b8",
|
20
20
|
}
|
21
21
|
headers["X-API-Key"] = self.api_key
|
22
|
-
headers["Cartesia-Version"] = "2024-
|
22
|
+
headers["Cartesia-Version"] = "2024-11-13"
|
23
23
|
return headers
|
24
24
|
|
25
25
|
def get_base_url(self) -> str:
|
cartesia/core/pagination.py
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
|
5
|
+
from typing_extensions import Self
|
6
|
+
|
7
|
+
import pydantic
|
8
|
+
|
9
|
+
# Generic to represent the underlying type of the results within a page
|
10
|
+
T = typing.TypeVar("T")
|
11
|
+
|
12
|
+
|
13
|
+
# SDKs implement a Page ABC per-pagination request, the endpoint then returns a pager that wraps this type
|
14
|
+
# for example, an endpoint will return SyncPager[UserPage] where UserPage implements the Page ABC. ex:
|
15
|
+
#
|
16
|
+
# SyncPager<InnerListType>(
|
17
|
+
# has_next=response.list_metadata.after is not None,
|
18
|
+
# items=response.data,
|
19
|
+
# # This should be the outer function that returns the SyncPager again
|
20
|
+
# get_next=lambda: list(..., cursor: response.cursor) (or list(..., offset: offset + 1))
|
21
|
+
# )
|
22
|
+
class BasePage(pydantic.BaseModel, typing.Generic[T]):
|
23
|
+
has_next: bool
|
24
|
+
items: typing.Optional[typing.List[T]]
|
25
|
+
|
26
|
+
|
27
|
+
class SyncPage(BasePage[T], typing.Generic[T]):
|
28
|
+
get_next: typing.Optional[typing.Callable[[], typing.Optional[Self]]]
|
29
|
+
|
30
|
+
|
31
|
+
class AsyncPage(BasePage[T], typing.Generic[T]):
|
32
|
+
get_next: typing.Optional[typing.Callable[[], typing.Awaitable[typing.Optional[Self]]]]
|
33
|
+
|
34
|
+
|
35
|
+
# ----------------------------
|
36
|
+
|
37
|
+
|
38
|
+
class SyncPager(SyncPage[T], typing.Generic[T]):
|
39
|
+
# Here we type ignore the iterator to avoid a mypy error
|
40
|
+
# caused by the type conflict with Pydanitc's __iter__ method
|
41
|
+
# brought in by extending the base model
|
42
|
+
def __iter__(self) -> typing.Iterator[T]: # type: ignore
|
43
|
+
for page in self.iter_pages():
|
44
|
+
if page.items is not None:
|
45
|
+
for item in page.items:
|
46
|
+
yield item
|
47
|
+
|
48
|
+
def iter_pages(self) -> typing.Iterator[SyncPage[T]]:
|
49
|
+
page: typing.Union[SyncPager[T], None] = self
|
50
|
+
while True:
|
51
|
+
if page is not None:
|
52
|
+
yield page
|
53
|
+
if page.has_next and page.get_next is not None:
|
54
|
+
page = page.get_next()
|
55
|
+
if page is None or page.items is None or len(page.items) == 0:
|
56
|
+
return
|
57
|
+
else:
|
58
|
+
return
|
59
|
+
else:
|
60
|
+
return
|
61
|
+
|
62
|
+
def next_page(self) -> typing.Optional[SyncPage[T]]:
|
63
|
+
return self.get_next() if self.get_next is not None else None
|
64
|
+
|
65
|
+
|
66
|
+
class AsyncPager(AsyncPage[T], typing.Generic[T]):
|
67
|
+
async def __aiter__(self) -> typing.AsyncIterator[T]: # type: ignore
|
68
|
+
async for page in self.iter_pages():
|
69
|
+
if page.items is not None:
|
70
|
+
for item in page.items:
|
71
|
+
yield item
|
72
|
+
|
73
|
+
async def iter_pages(self) -> typing.AsyncIterator[AsyncPage[T]]:
|
74
|
+
page: typing.Union[AsyncPager[T], None] = self
|
75
|
+
while True:
|
76
|
+
if page is not None:
|
77
|
+
yield page
|
78
|
+
if page is not None and page.has_next and page.get_next is not None:
|
79
|
+
page = await page.get_next()
|
80
|
+
if page is None or page.items is None or len(page.items) == 0:
|
81
|
+
return
|
82
|
+
else:
|
83
|
+
return
|
84
|
+
else:
|
85
|
+
return
|
86
|
+
|
87
|
+
async def next_page(self) -> typing.Optional[AsyncPage[T]]:
|
88
|
+
return await self.get_next() if self.get_next is not None else None
|
cartesia/infill/client.py
CHANGED
@@ -42,7 +42,7 @@ class InfillClient:
|
|
42
42
|
|
43
43
|
**The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
|
44
44
|
|
45
|
-
|
45
|
+
Infilling is only available on `sonic-2` at this time.
|
46
46
|
|
47
47
|
At least one of `left_audio` or `right_audio` must be provided.
|
48
48
|
|
@@ -117,7 +117,7 @@ class InfillClient:
|
|
117
117
|
api_key="YOUR_API_KEY",
|
118
118
|
)
|
119
119
|
client.infill.bytes(
|
120
|
-
model_id="sonic-
|
120
|
+
model_id="sonic-2",
|
121
121
|
language="en",
|
122
122
|
transcript="middle segment",
|
123
123
|
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
@@ -189,7 +189,7 @@ class AsyncInfillClient:
|
|
189
189
|
|
190
190
|
**The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
|
191
191
|
|
192
|
-
|
192
|
+
Infilling is only available on `sonic-2` at this time.
|
193
193
|
|
194
194
|
At least one of `left_audio` or `right_audio` must be provided.
|
195
195
|
|
@@ -269,7 +269,7 @@ class AsyncInfillClient:
|
|
269
269
|
|
270
270
|
async def main() -> None:
|
271
271
|
await client.infill.bytes(
|
272
|
-
model_id="sonic-
|
272
|
+
model_id="sonic-2",
|
273
273
|
language="en",
|
274
274
|
transcript="middle segment",
|
275
275
|
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
cartesia/tts/_async_websocket.py
CHANGED
@@ -17,6 +17,7 @@ from cartesia.tts.types import (
|
|
17
17
|
WebSocketResponse_FlushDone,
|
18
18
|
WebSocketTtsOutput,
|
19
19
|
WordTimestamps,
|
20
|
+
PhonemeTimestamps,
|
20
21
|
)
|
21
22
|
|
22
23
|
from ..core.pydantic_utilities import parse_obj_as
|
@@ -67,6 +68,8 @@ class _AsyncTTSContext:
|
|
67
68
|
language: Optional[str] = None,
|
68
69
|
stream: bool = True,
|
69
70
|
add_timestamps: bool = False,
|
71
|
+
add_phoneme_timestamps: bool = False,
|
72
|
+
use_original_timestamps: bool = False,
|
70
73
|
continue_: bool = False,
|
71
74
|
flush: bool = False,
|
72
75
|
) -> None:
|
@@ -102,6 +105,10 @@ class _AsyncTTSContext:
|
|
102
105
|
request_body["stream"] = stream
|
103
106
|
if add_timestamps:
|
104
107
|
request_body["add_timestamps"] = add_timestamps
|
108
|
+
if add_phoneme_timestamps:
|
109
|
+
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
110
|
+
if use_original_timestamps:
|
111
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
105
112
|
if continue_:
|
106
113
|
request_body["continue"] = continue_
|
107
114
|
if flush:
|
@@ -229,6 +236,11 @@ class _AsyncTTSContext:
|
|
229
236
|
finally:
|
230
237
|
self._close()
|
231
238
|
|
239
|
+
async def cancel(self):
|
240
|
+
"""Cancel the context. This will stop the generation of audio for this context."""
|
241
|
+
await self._websocket.websocket.send_json({"context_id": self._context_id, "cancel": True})
|
242
|
+
self._close()
|
243
|
+
|
232
244
|
def _close(self) -> None:
|
233
245
|
"""Closes the context. Automatically called when a done message is received for this context."""
|
234
246
|
self._websocket._remove_context(self._context_id)
|
@@ -297,7 +309,26 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
297
309
|
try:
|
298
310
|
self.websocket = await session.ws_connect(url)
|
299
311
|
except Exception as e:
|
300
|
-
|
312
|
+
# Extract status code if available
|
313
|
+
status_code = None
|
314
|
+
error_message = str(e)
|
315
|
+
|
316
|
+
if hasattr(e, 'status') and e.status is not None:
|
317
|
+
status_code = e.status
|
318
|
+
|
319
|
+
# Create a meaningful error message based on status code
|
320
|
+
if status_code == 402:
|
321
|
+
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
322
|
+
elif status_code == 401:
|
323
|
+
error_message = "Unauthorized. Please check your API key."
|
324
|
+
elif status_code == 403:
|
325
|
+
error_message = "Forbidden. You don't have permission to access this resource."
|
326
|
+
elif status_code == 404:
|
327
|
+
error_message = "Not found. The requested resource doesn't exist."
|
328
|
+
|
329
|
+
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
330
|
+
else:
|
331
|
+
raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
|
301
332
|
|
302
333
|
def _is_websocket_closed(self):
|
303
334
|
return self.websocket.closed
|
@@ -338,6 +369,8 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
338
369
|
language: Optional[str] = None,
|
339
370
|
stream: bool = True,
|
340
371
|
add_timestamps: bool = False,
|
372
|
+
add_phoneme_timestamps: bool = False,
|
373
|
+
use_original_timestamps: bool = False,
|
341
374
|
):
|
342
375
|
"""See :meth:`_WebSocket.send` for details."""
|
343
376
|
if context_id is None:
|
@@ -355,6 +388,8 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
355
388
|
language=language,
|
356
389
|
continue_=False,
|
357
390
|
add_timestamps=add_timestamps,
|
391
|
+
add_phoneme_timestamps=add_phoneme_timestamps,
|
392
|
+
use_original_timestamps=use_original_timestamps,
|
358
393
|
)
|
359
394
|
|
360
395
|
generator = ctx.receive()
|
@@ -366,6 +401,9 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
366
401
|
words: typing.List[str] = []
|
367
402
|
start: typing.List[float] = []
|
368
403
|
end: typing.List[float] = []
|
404
|
+
phonemes: typing.List[str] = []
|
405
|
+
phoneme_start: typing.List[float] = []
|
406
|
+
phoneme_end: typing.List[float] = []
|
369
407
|
async for chunk in generator:
|
370
408
|
if chunk.audio is not None:
|
371
409
|
chunks.append(chunk.audio)
|
@@ -374,6 +412,11 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
374
412
|
words.extend(chunk.word_timestamps.words)
|
375
413
|
start.extend(chunk.word_timestamps.start)
|
376
414
|
end.extend(chunk.word_timestamps.end)
|
415
|
+
if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
|
416
|
+
if chunk.phoneme_timestamps is not None:
|
417
|
+
phonemes.extend(chunk.phoneme_timestamps.phonemes)
|
418
|
+
phoneme_start.extend(chunk.phoneme_timestamps.start)
|
419
|
+
phoneme_end.extend(chunk.phoneme_timestamps.end)
|
377
420
|
|
378
421
|
return WebSocketTtsOutput(
|
379
422
|
audio=b"".join(chunks), # type: ignore
|
@@ -387,6 +430,15 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
387
430
|
if add_timestamps
|
388
431
|
else None
|
389
432
|
),
|
433
|
+
phoneme_timestamps=(
|
434
|
+
PhonemeTimestamps(
|
435
|
+
phonemes=phonemes,
|
436
|
+
start=phoneme_start,
|
437
|
+
end=phoneme_end,
|
438
|
+
)
|
439
|
+
if add_phoneme_timestamps
|
440
|
+
else None
|
441
|
+
),
|
390
442
|
)
|
391
443
|
|
392
444
|
async def _process_responses(self):
|
cartesia/tts/_websocket.py
CHANGED
@@ -26,6 +26,7 @@ from cartesia.tts.types import (
|
|
26
26
|
WebSocketResponse_Timestamps,
|
27
27
|
WebSocketTtsOutput,
|
28
28
|
WordTimestamps,
|
29
|
+
PhonemeTimestamps,
|
29
30
|
)
|
30
31
|
|
31
32
|
from ..core.pydantic_utilities import parse_obj_as
|
@@ -58,7 +59,7 @@ class _TTSContext:
|
|
58
59
|
self,
|
59
60
|
*,
|
60
61
|
model_id: str,
|
61
|
-
transcript: str,
|
62
|
+
transcript: typing.Generator[str, None, None],
|
62
63
|
output_format: OutputFormatParams,
|
63
64
|
voice: TtsRequestVoiceSpecifierParams,
|
64
65
|
context_id: Optional[str] = None,
|
@@ -66,6 +67,8 @@ class _TTSContext:
|
|
66
67
|
language: Optional[str] = None,
|
67
68
|
stream: bool = True,
|
68
69
|
add_timestamps: bool = False,
|
70
|
+
add_phoneme_timestamps: bool = False,
|
71
|
+
use_original_timestamps: bool = False,
|
69
72
|
) -> Generator[bytes, None, None]:
|
70
73
|
"""Send audio generation requests to the WebSocket and yield responses.
|
71
74
|
|
@@ -101,6 +104,10 @@ class _TTSContext:
|
|
101
104
|
request_body["stream"] = stream
|
102
105
|
if add_timestamps:
|
103
106
|
request_body["add_timestamps"] = add_timestamps
|
107
|
+
if add_phoneme_timestamps:
|
108
|
+
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
109
|
+
if use_original_timestamps:
|
110
|
+
request_body["use_original_timestamps"] = use_original_timestamps
|
104
111
|
|
105
112
|
if (
|
106
113
|
"context_id" in request_body
|
@@ -235,7 +242,7 @@ class TtsWebsocket:
|
|
235
242
|
Usage:
|
236
243
|
>>> ws = client.tts.websocket()
|
237
244
|
>>> generation_request = GenerationRequest(
|
238
|
-
... model_id="sonic-
|
245
|
+
... model_id="sonic-2",
|
239
246
|
... transcript="Hello world!",
|
240
247
|
... voice_embedding=embedding
|
241
248
|
... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}
|
@@ -281,7 +288,26 @@ class TtsWebsocket:
|
|
281
288
|
f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
|
282
289
|
)
|
283
290
|
except Exception as e:
|
284
|
-
|
291
|
+
# Extract status code if available
|
292
|
+
status_code = None
|
293
|
+
error_message = str(e)
|
294
|
+
|
295
|
+
if hasattr(e, 'status') and e.status is not None:
|
296
|
+
status_code = e.status
|
297
|
+
|
298
|
+
# Create a meaningful error message based on status code
|
299
|
+
if status_code == 402:
|
300
|
+
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
301
|
+
elif status_code == 401:
|
302
|
+
error_message = "Unauthorized. Please check your API key."
|
303
|
+
elif status_code == 403:
|
304
|
+
error_message = "Forbidden. You don't have permission to access this resource."
|
305
|
+
elif status_code == 404:
|
306
|
+
error_message = "Not found. The requested resource doesn't exist."
|
307
|
+
|
308
|
+
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
309
|
+
else:
|
310
|
+
raise RuntimeError(f"Failed to connect to WebSocket. {e}")
|
285
311
|
|
286
312
|
def _is_websocket_closed(self):
|
287
313
|
return self.websocket.socket.fileno() == -1
|
@@ -310,6 +336,8 @@ class TtsWebsocket:
|
|
310
336
|
out["audio"] = base64.b64decode(response.data)
|
311
337
|
elif isinstance(response, WebSocketResponse_Timestamps):
|
312
338
|
out["word_timestamps"] = response.word_timestamps # type: ignore
|
339
|
+
elif isinstance(response, WebSocketResponse_PhonemeTimestamps):
|
340
|
+
out["phoneme_timestamps"] = response.phoneme_timestamps # type: ignore
|
313
341
|
elif include_flush_id and isinstance(response, WebSocketResponse_FlushDone):
|
314
342
|
out["flush_done"] = response.flush_done # type: ignore
|
315
343
|
out["flush_id"] = response.flush_id # type: ignore
|
@@ -331,6 +359,8 @@ class TtsWebsocket:
|
|
331
359
|
language: Optional[str] = None,
|
332
360
|
stream: bool = True,
|
333
361
|
add_timestamps: bool = False,
|
362
|
+
add_phoneme_timestamps: bool = False,
|
363
|
+
use_original_timestamps: bool = False,
|
334
364
|
):
|
335
365
|
"""Send a request to the WebSocket to generate audio.
|
336
366
|
|
@@ -360,6 +390,8 @@ class TtsWebsocket:
|
|
360
390
|
"language": language,
|
361
391
|
"stream": stream,
|
362
392
|
"add_timestamps": add_timestamps,
|
393
|
+
"add_phoneme_timestamps": add_phoneme_timestamps,
|
394
|
+
"use_original_timestamps": use_original_timestamps,
|
363
395
|
}
|
364
396
|
generator = self._websocket_generator(request_body)
|
365
397
|
|
@@ -370,6 +402,9 @@ class TtsWebsocket:
|
|
370
402
|
words: typing.List[str] = []
|
371
403
|
start: typing.List[float] = []
|
372
404
|
end: typing.List[float] = []
|
405
|
+
phonemes: typing.List[str] = []
|
406
|
+
phoneme_start: typing.List[float] = []
|
407
|
+
phoneme_end: typing.List[float] = []
|
373
408
|
for chunk in generator:
|
374
409
|
if chunk.audio is not None:
|
375
410
|
chunks.append(chunk.audio)
|
@@ -378,6 +413,11 @@ class TtsWebsocket:
|
|
378
413
|
words.extend(chunk.word_timestamps.words)
|
379
414
|
start.extend(chunk.word_timestamps.start)
|
380
415
|
end.extend(chunk.word_timestamps.end)
|
416
|
+
if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
|
417
|
+
if chunk.phoneme_timestamps is not None:
|
418
|
+
phonemes.extend(chunk.phoneme_timestamps.phonemes)
|
419
|
+
phoneme_start.extend(chunk.phoneme_timestamps.start)
|
420
|
+
phoneme_end.extend(chunk.phoneme_timestamps.end)
|
381
421
|
|
382
422
|
return WebSocketTtsOutput(
|
383
423
|
audio=b"".join(chunks), # type: ignore
|
@@ -391,6 +431,15 @@ class TtsWebsocket:
|
|
391
431
|
if add_timestamps
|
392
432
|
else None
|
393
433
|
),
|
434
|
+
phoneme_timestamps=(
|
435
|
+
PhonemeTimestamps(
|
436
|
+
phonemes=phonemes,
|
437
|
+
start=phoneme_start,
|
438
|
+
end=phoneme_end,
|
439
|
+
)
|
440
|
+
if add_phoneme_timestamps
|
441
|
+
else None
|
442
|
+
),
|
394
443
|
)
|
395
444
|
|
396
445
|
def _websocket_generator(self, request_body: Dict[str, Any]):
|
cartesia/tts/client.py
CHANGED
@@ -67,7 +67,7 @@ class TtsClient:
|
|
67
67
|
api_key="YOUR_API_KEY",
|
68
68
|
)
|
69
69
|
client.tts.bytes(
|
70
|
-
model_id="sonic",
|
70
|
+
model_id="sonic-2",
|
71
71
|
transcript="Hello, world!",
|
72
72
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
73
73
|
language="en",
|
@@ -152,7 +152,7 @@ class TtsClient:
|
|
152
152
|
api_key="YOUR_API_KEY",
|
153
153
|
)
|
154
154
|
response = client.tts.sse(
|
155
|
-
model_id="sonic",
|
155
|
+
model_id="sonic-2",
|
156
156
|
transcript="Hello, world!",
|
157
157
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
158
158
|
language="en",
|
@@ -258,7 +258,7 @@ class AsyncTtsClient:
|
|
258
258
|
|
259
259
|
async def main() -> None:
|
260
260
|
await client.tts.bytes(
|
261
|
-
model_id="sonic",
|
261
|
+
model_id="sonic-2",
|
262
262
|
transcript="Hello, world!",
|
263
263
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
264
264
|
language="en",
|
@@ -351,7 +351,7 @@ class AsyncTtsClient:
|
|
351
351
|
|
352
352
|
async def main() -> None:
|
353
353
|
response = await client.tts.sse(
|
354
|
-
model_id="sonic",
|
354
|
+
model_id="sonic-2",
|
355
355
|
transcript="Hello, world!",
|
356
356
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
357
357
|
language="en",
|
cartesia/tts/requests/web_socket_chunk_response.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
3
|
from .web_socket_base_response import WebSocketBaseResponseParams
|
4
|
+
import typing_extensions
|
5
|
+
from ..types.flush_id import FlushId
|
4
6
|
|
5
7
|
|
6
8
|
class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
|
7
9
|
data: str
|
8
10
|
step_time: float
|
11
|
+
flush_id: typing_extensions.NotRequired[FlushId]
|
cartesia/tts/requests/web_socket_response.py
CHANGED
@@ -4,8 +4,8 @@ from __future__ import annotations
|
|
4
4
|
import typing_extensions
|
5
5
|
import typing
|
6
6
|
import typing_extensions
|
7
|
-
from ..types.context_id import ContextId
|
8
7
|
from ..types.flush_id import FlushId
|
8
|
+
from ..types.context_id import ContextId
|
9
9
|
from .word_timestamps import WordTimestampsParams
|
10
10
|
from .phoneme_timestamps import PhonemeTimestampsParams
|
11
11
|
|
@@ -14,6 +14,7 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
|
|
14
14
|
type: typing.Literal["chunk"]
|
15
15
|
data: str
|
16
16
|
step_time: float
|
17
|
+
flush_id: typing_extensions.NotRequired[FlushId]
|
17
18
|
context_id: typing_extensions.NotRequired[ContextId]
|
18
19
|
status_code: int
|
19
20
|
done: bool
|
cartesia/tts/requests/web_socket_tts_request.py
CHANGED
@@ -19,6 +19,7 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
|
|
19
19
|
duration: typing_extensions.NotRequired[int]
|
20
20
|
language: typing_extensions.NotRequired[str]
|
21
21
|
add_timestamps: typing_extensions.NotRequired[bool]
|
22
|
+
use_original_timestamps: typing_extensions.NotRequired[bool]
|
22
23
|
add_phoneme_timestamps: typing_extensions.NotRequired[bool]
|
23
24
|
continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
|
24
25
|
context_id: typing_extensions.NotRequired[str]
|
cartesia/tts/types/emotion.py
CHANGED
@@ -6,22 +6,27 @@ Emotion = typing.Union[
|
|
6
6
|
typing.Literal[
|
7
7
|
"anger:lowest",
|
8
8
|
"anger:low",
|
9
|
+
"anger",
|
9
10
|
"anger:high",
|
10
11
|
"anger:highest",
|
11
12
|
"positivity:lowest",
|
12
13
|
"positivity:low",
|
14
|
+
"positivity",
|
13
15
|
"positivity:high",
|
14
16
|
"positivity:highest",
|
15
17
|
"surprise:lowest",
|
16
18
|
"surprise:low",
|
19
|
+
"surprise",
|
17
20
|
"surprise:high",
|
18
21
|
"surprise:highest",
|
19
22
|
"sadness:lowest",
|
20
23
|
"sadness:low",
|
24
|
+
"sadness",
|
21
25
|
"sadness:high",
|
22
26
|
"sadness:highest",
|
23
27
|
"curiosity:lowest",
|
24
28
|
"curiosity:low",
|
29
|
+
"curiosity",
|
25
30
|
"curiosity:high",
|
26
31
|
"curiosity:highest",
|
27
32
|
],
|
cartesia/tts/types/generation_request.py
CHANGED
@@ -56,6 +56,11 @@ class GenerationRequest(UniversalBaseModel):
|
|
56
56
|
Whether to return phoneme-level timestamps.
|
57
57
|
"""
|
58
58
|
|
59
|
+
use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
|
60
|
+
"""
|
61
|
+
Whether to use the original transcript for timestamps.
|
62
|
+
"""
|
63
|
+
|
59
64
|
if IS_PYDANTIC_V2:
|
60
65
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
61
66
|
else:
|
cartesia/tts/types/web_socket_chunk_response.py
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
3
|
from .web_socket_base_response import WebSocketBaseResponse
|
4
|
-
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
5
4
|
import typing
|
5
|
+
from .flush_id import FlushId
|
6
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
7
|
import pydantic
|
7
8
|
|
8
9
|
|
9
10
|
class WebSocketChunkResponse(WebSocketBaseResponse):
|
10
11
|
data: str
|
11
12
|
step_time: float
|
13
|
+
flush_id: typing.Optional[FlushId] = None
|
12
14
|
|
13
15
|
if IS_PYDANTIC_V2:
|
14
16
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
cartesia/tts/types/web_socket_response.py
CHANGED
@@ -3,10 +3,10 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
from ...core.pydantic_utilities import UniversalBaseModel
|
5
5
|
import typing
|
6
|
+
from .flush_id import FlushId
|
6
7
|
from .context_id import ContextId
|
7
8
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
8
9
|
import pydantic
|
9
|
-
from .flush_id import FlushId
|
10
10
|
from .word_timestamps import WordTimestamps
|
11
11
|
from .phoneme_timestamps import PhonemeTimestamps
|
12
12
|
|
@@ -15,6 +15,7 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
|
|
15
15
|
type: typing.Literal["chunk"] = "chunk"
|
16
16
|
data: str
|
17
17
|
step_time: float
|
18
|
+
flush_id: typing.Optional[FlushId] = None
|
18
19
|
context_id: typing.Optional[ContextId] = None
|
19
20
|
status_code: int
|
20
21
|
done: bool
|