cartesia 2.0.0b1__py3-none-any.whl → 2.0.0b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/__init__.py +8 -4
- cartesia/base_client.py +0 -4
- cartesia/core/__init__.py +3 -0
- cartesia/core/client_wrapper.py +2 -2
- cartesia/core/pagination.py +88 -0
- cartesia/infill/client.py +4 -4
- cartesia/tts/_async_websocket.py +48 -1
- cartesia/tts/_websocket.py +44 -3
- cartesia/tts/client.py +4 -4
- cartesia/tts/requests/generation_request.py +5 -0
- cartesia/tts/requests/web_socket_chunk_response.py +3 -0
- cartesia/tts/requests/web_socket_response.py +2 -1
- cartesia/tts/requests/web_socket_tts_request.py +1 -0
- cartesia/tts/types/emotion.py +5 -0
- cartesia/tts/types/generation_request.py +5 -0
- cartesia/tts/types/web_socket_chunk_response.py +3 -1
- cartesia/tts/types/web_socket_response.py +2 -1
- cartesia/tts/types/web_socket_tts_output.py +2 -0
- cartesia/tts/types/web_socket_tts_request.py +1 -0
- cartesia/tts/utils/constants.py +2 -2
- cartesia/voice_changer/requests/streaming_response.py +2 -0
- cartesia/voice_changer/types/streaming_response.py +2 -0
- cartesia/voices/__init__.py +8 -4
- cartesia/voices/client.py +285 -169
- cartesia/voices/requests/__init__.py +2 -0
- cartesia/voices/requests/create_voice_request.py +0 -2
- cartesia/voices/requests/get_voices_response.py +24 -0
- cartesia/voices/requests/localize_dialect.py +1 -3
- cartesia/voices/requests/voice.py +13 -9
- cartesia/voices/types/__init__.py +6 -4
- cartesia/voices/types/create_voice_request.py +0 -2
- cartesia/voices/types/gender_presentation.py +5 -0
- cartesia/voices/types/get_voices_response.py +34 -0
- cartesia/voices/types/localize_dialect.py +1 -3
- cartesia/voices/types/voice.py +13 -9
- cartesia/voices/types/voice_expand_options.py +5 -0
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/METADATA +151 -49
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/RECORD +39 -37
- cartesia/datasets/client.py +0 -392
- cartesia/voices/types/localize_portuguese_dialect.py +0 -5
- cartesia/voices/types/localize_spanish_dialect.py +0 -5
- {cartesia-2.0.0b1.dist-info → cartesia-2.0.0b7.dist-info}/WHEEL +0 -0
cartesia/__init__.py
CHANGED
@@ -121,13 +121,14 @@ from .voices import (
|
|
121
121
|
EmbeddingSpecifier,
|
122
122
|
EmbeddingSpecifierParams,
|
123
123
|
Gender,
|
124
|
+
GenderPresentation,
|
125
|
+
GetVoicesResponse,
|
126
|
+
GetVoicesResponseParams,
|
124
127
|
IdSpecifier,
|
125
128
|
IdSpecifierParams,
|
126
129
|
LocalizeDialect,
|
127
130
|
LocalizeDialectParams,
|
128
131
|
LocalizeEnglishDialect,
|
129
|
-
LocalizePortugueseDialect,
|
130
|
-
LocalizeSpanishDialect,
|
131
132
|
LocalizeTargetLanguage,
|
132
133
|
LocalizeVoiceRequest,
|
133
134
|
LocalizeVoiceRequestParams,
|
@@ -138,6 +139,7 @@ from .voices import (
|
|
138
139
|
UpdateVoiceRequest,
|
139
140
|
UpdateVoiceRequestParams,
|
140
141
|
Voice,
|
142
|
+
VoiceExpandOptions,
|
141
143
|
VoiceId,
|
142
144
|
VoiceMetadata,
|
143
145
|
VoiceMetadataParams,
|
@@ -175,15 +177,16 @@ __all__ = [
|
|
175
177
|
"FilePurpose",
|
176
178
|
"FlushId",
|
177
179
|
"Gender",
|
180
|
+
"GenderPresentation",
|
178
181
|
"GenerationRequest",
|
179
182
|
"GenerationRequestParams",
|
183
|
+
"GetVoicesResponse",
|
184
|
+
"GetVoicesResponseParams",
|
180
185
|
"IdSpecifier",
|
181
186
|
"IdSpecifierParams",
|
182
187
|
"LocalizeDialect",
|
183
188
|
"LocalizeDialectParams",
|
184
189
|
"LocalizeEnglishDialect",
|
185
|
-
"LocalizePortugueseDialect",
|
186
|
-
"LocalizeSpanishDialect",
|
187
190
|
"LocalizeTargetLanguage",
|
188
191
|
"LocalizeVoiceRequest",
|
189
192
|
"LocalizeVoiceRequestParams",
|
@@ -235,6 +238,7 @@ __all__ = [
|
|
235
238
|
"UpdateVoiceRequest",
|
236
239
|
"UpdateVoiceRequestParams",
|
237
240
|
"Voice",
|
241
|
+
"VoiceExpandOptions",
|
238
242
|
"VoiceId",
|
239
243
|
"VoiceMetadata",
|
240
244
|
"VoiceMetadataParams",
|
cartesia/base_client.py
CHANGED
@@ -5,14 +5,12 @@ from .environment import CartesiaEnvironment
|
|
5
5
|
import httpx
|
6
6
|
from .core.client_wrapper import SyncClientWrapper
|
7
7
|
from .api_status.client import ApiStatusClient
|
8
|
-
from .datasets.client import DatasetsClient
|
9
8
|
from .infill.client import InfillClient
|
10
9
|
from .tts.client import TtsClient
|
11
10
|
from .voice_changer.client import VoiceChangerClient
|
12
11
|
from .voices.client import VoicesClient
|
13
12
|
from .core.client_wrapper import AsyncClientWrapper
|
14
13
|
from .api_status.client import AsyncApiStatusClient
|
15
|
-
from .datasets.client import AsyncDatasetsClient
|
16
14
|
from .infill.client import AsyncInfillClient
|
17
15
|
from .tts.client import AsyncTtsClient
|
18
16
|
from .voice_changer.client import AsyncVoiceChangerClient
|
@@ -78,7 +76,6 @@ class BaseCartesia:
|
|
78
76
|
timeout=_defaulted_timeout,
|
79
77
|
)
|
80
78
|
self.api_status = ApiStatusClient(client_wrapper=self._client_wrapper)
|
81
|
-
self.datasets = DatasetsClient(client_wrapper=self._client_wrapper)
|
82
79
|
self.infill = InfillClient(client_wrapper=self._client_wrapper)
|
83
80
|
self.tts = TtsClient(client_wrapper=self._client_wrapper)
|
84
81
|
self.voice_changer = VoiceChangerClient(client_wrapper=self._client_wrapper)
|
@@ -144,7 +141,6 @@ class AsyncBaseCartesia:
|
|
144
141
|
timeout=_defaulted_timeout,
|
145
142
|
)
|
146
143
|
self.api_status = AsyncApiStatusClient(client_wrapper=self._client_wrapper)
|
147
|
-
self.datasets = AsyncDatasetsClient(client_wrapper=self._client_wrapper)
|
148
144
|
self.infill = AsyncInfillClient(client_wrapper=self._client_wrapper)
|
149
145
|
self.tts = AsyncTtsClient(client_wrapper=self._client_wrapper)
|
150
146
|
self.voice_changer = AsyncVoiceChangerClient(client_wrapper=self._client_wrapper)
|
cartesia/core/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from .datetime_utils import serialize_datetime
|
|
6
6
|
from .file import File, convert_file_dict_to_httpx_tuples, with_content_type
|
7
7
|
from .http_client import AsyncHttpClient, HttpClient
|
8
8
|
from .jsonable_encoder import jsonable_encoder
|
9
|
+
from .pagination import AsyncPager, SyncPager
|
9
10
|
from .pydantic_utilities import (
|
10
11
|
IS_PYDANTIC_V2,
|
11
12
|
UniversalBaseModel,
|
@@ -24,6 +25,7 @@ __all__ = [
|
|
24
25
|
"ApiError",
|
25
26
|
"AsyncClientWrapper",
|
26
27
|
"AsyncHttpClient",
|
28
|
+
"AsyncPager",
|
27
29
|
"BaseClientWrapper",
|
28
30
|
"FieldMetadata",
|
29
31
|
"File",
|
@@ -31,6 +33,7 @@ __all__ = [
|
|
31
33
|
"IS_PYDANTIC_V2",
|
32
34
|
"RequestOptions",
|
33
35
|
"SyncClientWrapper",
|
36
|
+
"SyncPager",
|
34
37
|
"UniversalBaseModel",
|
35
38
|
"UniversalRootModel",
|
36
39
|
"convert_and_respect_annotation_metadata",
|
cartesia/core/client_wrapper.py
CHANGED
@@ -16,10 +16,10 @@ class BaseClientWrapper:
|
|
16
16
|
headers: typing.Dict[str, str] = {
|
17
17
|
"X-Fern-Language": "Python",
|
18
18
|
"X-Fern-SDK-Name": "cartesia",
|
19
|
-
"X-Fern-SDK-Version": "2.0.
|
19
|
+
"X-Fern-SDK-Version": "2.0.0b7",
|
20
20
|
}
|
21
21
|
headers["X-API-Key"] = self.api_key
|
22
|
-
headers["Cartesia-Version"] = "2024-
|
22
|
+
headers["Cartesia-Version"] = "2024-11-13"
|
23
23
|
return headers
|
24
24
|
|
25
25
|
def get_base_url(self) -> str:
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# This file was auto-generated by Fern from our API Definition.
|
2
|
+
|
3
|
+
import typing
|
4
|
+
|
5
|
+
from typing_extensions import Self
|
6
|
+
|
7
|
+
import pydantic
|
8
|
+
|
9
|
+
# Generic to represent the underlying type of the results within a page
|
10
|
+
T = typing.TypeVar("T")
|
11
|
+
|
12
|
+
|
13
|
+
# SDKs implement a Page ABC per-pagination request, the endpoint then returns a pager that wraps this type
|
14
|
+
# for example, an endpoint will return SyncPager[UserPage] where UserPage implements the Page ABC. ex:
|
15
|
+
#
|
16
|
+
# SyncPager<InnerListType>(
|
17
|
+
# has_next=response.list_metadata.after is not None,
|
18
|
+
# items=response.data,
|
19
|
+
# # This should be the outer function that returns the SyncPager again
|
20
|
+
# get_next=lambda: list(..., cursor: response.cursor) (or list(..., offset: offset + 1))
|
21
|
+
# )
|
22
|
+
class BasePage(pydantic.BaseModel, typing.Generic[T]):
|
23
|
+
has_next: bool
|
24
|
+
items: typing.Optional[typing.List[T]]
|
25
|
+
|
26
|
+
|
27
|
+
class SyncPage(BasePage[T], typing.Generic[T]):
|
28
|
+
get_next: typing.Optional[typing.Callable[[], typing.Optional[Self]]]
|
29
|
+
|
30
|
+
|
31
|
+
class AsyncPage(BasePage[T], typing.Generic[T]):
|
32
|
+
get_next: typing.Optional[typing.Callable[[], typing.Awaitable[typing.Optional[Self]]]]
|
33
|
+
|
34
|
+
|
35
|
+
# ----------------------------
|
36
|
+
|
37
|
+
|
38
|
+
class SyncPager(SyncPage[T], typing.Generic[T]):
|
39
|
+
# Here we type ignore the iterator to avoid a mypy error
|
40
|
+
# caused by the type conflict with Pydantic's __iter__ method
|
41
|
+
# brought in by extending the base model
|
42
|
+
def __iter__(self) -> typing.Iterator[T]: # type: ignore
|
43
|
+
for page in self.iter_pages():
|
44
|
+
if page.items is not None:
|
45
|
+
for item in page.items:
|
46
|
+
yield item
|
47
|
+
|
48
|
+
def iter_pages(self) -> typing.Iterator[SyncPage[T]]:
|
49
|
+
page: typing.Union[SyncPager[T], None] = self
|
50
|
+
while True:
|
51
|
+
if page is not None:
|
52
|
+
yield page
|
53
|
+
if page.has_next and page.get_next is not None:
|
54
|
+
page = page.get_next()
|
55
|
+
if page is None or page.items is None or len(page.items) == 0:
|
56
|
+
return
|
57
|
+
else:
|
58
|
+
return
|
59
|
+
else:
|
60
|
+
return
|
61
|
+
|
62
|
+
def next_page(self) -> typing.Optional[SyncPage[T]]:
|
63
|
+
return self.get_next() if self.get_next is not None else None
|
64
|
+
|
65
|
+
|
66
|
+
class AsyncPager(AsyncPage[T], typing.Generic[T]):
|
67
|
+
async def __aiter__(self) -> typing.AsyncIterator[T]: # type: ignore
|
68
|
+
async for page in self.iter_pages():
|
69
|
+
if page.items is not None:
|
70
|
+
for item in page.items:
|
71
|
+
yield item
|
72
|
+
|
73
|
+
async def iter_pages(self) -> typing.AsyncIterator[AsyncPage[T]]:
|
74
|
+
page: typing.Union[AsyncPager[T], None] = self
|
75
|
+
while True:
|
76
|
+
if page is not None:
|
77
|
+
yield page
|
78
|
+
if page is not None and page.has_next and page.get_next is not None:
|
79
|
+
page = await page.get_next()
|
80
|
+
if page is None or page.items is None or len(page.items) == 0:
|
81
|
+
return
|
82
|
+
else:
|
83
|
+
return
|
84
|
+
else:
|
85
|
+
return
|
86
|
+
|
87
|
+
async def next_page(self) -> typing.Optional[AsyncPage[T]]:
|
88
|
+
return await self.get_next() if self.get_next is not None else None
|
cartesia/infill/client.py
CHANGED
@@ -42,7 +42,7 @@ class InfillClient:
|
|
42
42
|
|
43
43
|
**The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
|
44
44
|
|
45
|
-
|
45
|
+
Infilling is only available on `sonic-2` at this time.
|
46
46
|
|
47
47
|
At least one of `left_audio` or `right_audio` must be provided.
|
48
48
|
|
@@ -117,7 +117,7 @@ class InfillClient:
|
|
117
117
|
api_key="YOUR_API_KEY",
|
118
118
|
)
|
119
119
|
client.infill.bytes(
|
120
|
-
model_id="sonic-
|
120
|
+
model_id="sonic-2",
|
121
121
|
language="en",
|
122
122
|
transcript="middle segment",
|
123
123
|
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
@@ -189,7 +189,7 @@ class AsyncInfillClient:
|
|
189
189
|
|
190
190
|
**The cost is 1 credit per character of the infill text plus a fixed cost of 300 credits.**
|
191
191
|
|
192
|
-
|
192
|
+
Infilling is only available on `sonic-2` at this time.
|
193
193
|
|
194
194
|
At least one of `left_audio` or `right_audio` must be provided.
|
195
195
|
|
@@ -269,7 +269,7 @@ class AsyncInfillClient:
|
|
269
269
|
|
270
270
|
async def main() -> None:
|
271
271
|
await client.infill.bytes(
|
272
|
-
model_id="sonic-
|
272
|
+
model_id="sonic-2",
|
273
273
|
language="en",
|
274
274
|
transcript="middle segment",
|
275
275
|
voice_id="694f9389-aac1-45b6-b726-9d9369183238",
|
cartesia/tts/_async_websocket.py
CHANGED
@@ -17,6 +17,7 @@ from cartesia.tts.types import (
|
|
17
17
|
WebSocketResponse_FlushDone,
|
18
18
|
WebSocketTtsOutput,
|
19
19
|
WordTimestamps,
|
20
|
+
PhonemeTimestamps,
|
20
21
|
)
|
21
22
|
|
22
23
|
from ..core.pydantic_utilities import parse_obj_as
|
@@ -67,6 +68,7 @@ class _AsyncTTSContext:
|
|
67
68
|
language: Optional[str] = None,
|
68
69
|
stream: bool = True,
|
69
70
|
add_timestamps: bool = False,
|
71
|
+
add_phoneme_timestamps: bool = False,
|
70
72
|
continue_: bool = False,
|
71
73
|
flush: bool = False,
|
72
74
|
) -> None:
|
@@ -102,6 +104,8 @@ class _AsyncTTSContext:
|
|
102
104
|
request_body["stream"] = stream
|
103
105
|
if add_timestamps:
|
104
106
|
request_body["add_timestamps"] = add_timestamps
|
107
|
+
if add_phoneme_timestamps:
|
108
|
+
request_body["add_phoneme_timestamps"] = add_phoneme_timestamps
|
105
109
|
if continue_:
|
106
110
|
request_body["continue"] = continue_
|
107
111
|
if flush:
|
@@ -229,6 +233,11 @@ class _AsyncTTSContext:
|
|
229
233
|
finally:
|
230
234
|
self._close()
|
231
235
|
|
236
|
+
async def cancel(self):
|
237
|
+
"""Cancel the context. This will stop the generation of audio for this context."""
|
238
|
+
await self._websocket.websocket.send_json({"context_id": self._context_id, "cancel": True})
|
239
|
+
self._close()
|
240
|
+
|
232
241
|
def _close(self) -> None:
|
233
242
|
"""Closes the context. Automatically called when a done message is received for this context."""
|
234
243
|
self._websocket._remove_context(self._context_id)
|
@@ -297,7 +306,26 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
297
306
|
try:
|
298
307
|
self.websocket = await session.ws_connect(url)
|
299
308
|
except Exception as e:
|
300
|
-
|
309
|
+
# Extract status code if available
|
310
|
+
status_code = None
|
311
|
+
error_message = str(e)
|
312
|
+
|
313
|
+
if hasattr(e, 'status') and e.status is not None:
|
314
|
+
status_code = e.status
|
315
|
+
|
316
|
+
# Create a meaningful error message based on status code
|
317
|
+
if status_code == 402:
|
318
|
+
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
319
|
+
elif status_code == 401:
|
320
|
+
error_message = "Unauthorized. Please check your API key."
|
321
|
+
elif status_code == 403:
|
322
|
+
error_message = "Forbidden. You don't have permission to access this resource."
|
323
|
+
elif status_code == 404:
|
324
|
+
error_message = "Not found. The requested resource doesn't exist."
|
325
|
+
|
326
|
+
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
327
|
+
else:
|
328
|
+
raise RuntimeError(f"Failed to connect to WebSocket at {url}. {e}")
|
301
329
|
|
302
330
|
def _is_websocket_closed(self):
|
303
331
|
return self.websocket.closed
|
@@ -338,6 +366,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
338
366
|
language: Optional[str] = None,
|
339
367
|
stream: bool = True,
|
340
368
|
add_timestamps: bool = False,
|
369
|
+
add_phoneme_timestamps: bool = False,
|
341
370
|
):
|
342
371
|
"""See :meth:`_WebSocket.send` for details."""
|
343
372
|
if context_id is None:
|
@@ -355,6 +384,7 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
355
384
|
language=language,
|
356
385
|
continue_=False,
|
357
386
|
add_timestamps=add_timestamps,
|
387
|
+
add_phoneme_timestamps=add_phoneme_timestamps,
|
358
388
|
)
|
359
389
|
|
360
390
|
generator = ctx.receive()
|
@@ -366,6 +396,9 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
366
396
|
words: typing.List[str] = []
|
367
397
|
start: typing.List[float] = []
|
368
398
|
end: typing.List[float] = []
|
399
|
+
phonemes: typing.List[str] = []
|
400
|
+
phoneme_start: typing.List[float] = []
|
401
|
+
phoneme_end: typing.List[float] = []
|
369
402
|
async for chunk in generator:
|
370
403
|
if chunk.audio is not None:
|
371
404
|
chunks.append(chunk.audio)
|
@@ -374,6 +407,11 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
374
407
|
words.extend(chunk.word_timestamps.words)
|
375
408
|
start.extend(chunk.word_timestamps.start)
|
376
409
|
end.extend(chunk.word_timestamps.end)
|
410
|
+
if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
|
411
|
+
if chunk.phoneme_timestamps is not None:
|
412
|
+
phonemes.extend(chunk.phoneme_timestamps.phonemes)
|
413
|
+
phoneme_start.extend(chunk.phoneme_timestamps.start)
|
414
|
+
phoneme_end.extend(chunk.phoneme_timestamps.end)
|
377
415
|
|
378
416
|
return WebSocketTtsOutput(
|
379
417
|
audio=b"".join(chunks), # type: ignore
|
@@ -387,6 +425,15 @@ class AsyncTtsWebsocket(TtsWebsocket):
|
|
387
425
|
if add_timestamps
|
388
426
|
else None
|
389
427
|
),
|
428
|
+
phoneme_timestamps=(
|
429
|
+
PhonemeTimestamps(
|
430
|
+
phonemes=phonemes,
|
431
|
+
start=phoneme_start,
|
432
|
+
end=phoneme_end,
|
433
|
+
)
|
434
|
+
if add_phoneme_timestamps
|
435
|
+
else None
|
436
|
+
),
|
390
437
|
)
|
391
438
|
|
392
439
|
async def _process_responses(self):
|
cartesia/tts/_websocket.py
CHANGED
@@ -26,6 +26,7 @@ from cartesia.tts.types import (
|
|
26
26
|
WebSocketResponse_Timestamps,
|
27
27
|
WebSocketTtsOutput,
|
28
28
|
WordTimestamps,
|
29
|
+
PhonemeTimestamps,
|
29
30
|
)
|
30
31
|
|
31
32
|
from ..core.pydantic_utilities import parse_obj_as
|
@@ -58,7 +59,7 @@ class _TTSContext:
|
|
58
59
|
self,
|
59
60
|
*,
|
60
61
|
model_id: str,
|
61
|
-
transcript: str,
|
62
|
+
transcript: typing.Generator[str, None, None],
|
62
63
|
output_format: OutputFormatParams,
|
63
64
|
voice: TtsRequestVoiceSpecifierParams,
|
64
65
|
context_id: Optional[str] = None,
|
@@ -235,7 +236,7 @@ class TtsWebsocket:
|
|
235
236
|
Usage:
|
236
237
|
>>> ws = client.tts.websocket()
|
237
238
|
>>> generation_request = GenerationRequest(
|
238
|
-
... model_id="sonic-
|
239
|
+
... model_id="sonic-2",
|
239
240
|
... transcript="Hello world!",
|
240
241
|
... voice_embedding=embedding
|
241
242
|
... output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100}
|
@@ -281,7 +282,26 @@ class TtsWebsocket:
|
|
281
282
|
f"{self.ws_url}/{route}?api_key={self.api_key}&cartesia_version={self.cartesia_version}"
|
282
283
|
)
|
283
284
|
except Exception as e:
|
284
|
-
|
285
|
+
# Extract status code if available
|
286
|
+
status_code = None
|
287
|
+
error_message = str(e)
|
288
|
+
|
289
|
+
if hasattr(e, 'status') and e.status is not None:
|
290
|
+
status_code = e.status
|
291
|
+
|
292
|
+
# Create a meaningful error message based on status code
|
293
|
+
if status_code == 402:
|
294
|
+
error_message = "Payment required. Your API key may have insufficient credits or permissions."
|
295
|
+
elif status_code == 401:
|
296
|
+
error_message = "Unauthorized. Please check your API key."
|
297
|
+
elif status_code == 403:
|
298
|
+
error_message = "Forbidden. You don't have permission to access this resource."
|
299
|
+
elif status_code == 404:
|
300
|
+
error_message = "Not found. The requested resource doesn't exist."
|
301
|
+
|
302
|
+
raise RuntimeError(f"Failed to connect to WebSocket.\nStatus: {status_code}. Error message: {error_message}")
|
303
|
+
else:
|
304
|
+
raise RuntimeError(f"Failed to connect to WebSocket. {e}")
|
285
305
|
|
286
306
|
def _is_websocket_closed(self):
|
287
307
|
return self.websocket.socket.fileno() == -1
|
@@ -310,6 +330,8 @@ class TtsWebsocket:
|
|
310
330
|
out["audio"] = base64.b64decode(response.data)
|
311
331
|
elif isinstance(response, WebSocketResponse_Timestamps):
|
312
332
|
out["word_timestamps"] = response.word_timestamps # type: ignore
|
333
|
+
elif isinstance(response, WebSocketResponse_PhonemeTimestamps):
|
334
|
+
out["phoneme_timestamps"] = response.phoneme_timestamps # type: ignore
|
313
335
|
elif include_flush_id and isinstance(response, WebSocketResponse_FlushDone):
|
314
336
|
out["flush_done"] = response.flush_done # type: ignore
|
315
337
|
out["flush_id"] = response.flush_id # type: ignore
|
@@ -331,6 +353,7 @@ class TtsWebsocket:
|
|
331
353
|
language: Optional[str] = None,
|
332
354
|
stream: bool = True,
|
333
355
|
add_timestamps: bool = False,
|
356
|
+
add_phoneme_timestamps: bool = False,
|
334
357
|
):
|
335
358
|
"""Send a request to the WebSocket to generate audio.
|
336
359
|
|
@@ -360,6 +383,7 @@ class TtsWebsocket:
|
|
360
383
|
"language": language,
|
361
384
|
"stream": stream,
|
362
385
|
"add_timestamps": add_timestamps,
|
386
|
+
"add_phoneme_timestamps": add_phoneme_timestamps,
|
363
387
|
}
|
364
388
|
generator = self._websocket_generator(request_body)
|
365
389
|
|
@@ -370,6 +394,9 @@ class TtsWebsocket:
|
|
370
394
|
words: typing.List[str] = []
|
371
395
|
start: typing.List[float] = []
|
372
396
|
end: typing.List[float] = []
|
397
|
+
phonemes: typing.List[str] = []
|
398
|
+
phoneme_start: typing.List[float] = []
|
399
|
+
phoneme_end: typing.List[float] = []
|
373
400
|
for chunk in generator:
|
374
401
|
if chunk.audio is not None:
|
375
402
|
chunks.append(chunk.audio)
|
@@ -378,6 +405,11 @@ class TtsWebsocket:
|
|
378
405
|
words.extend(chunk.word_timestamps.words)
|
379
406
|
start.extend(chunk.word_timestamps.start)
|
380
407
|
end.extend(chunk.word_timestamps.end)
|
408
|
+
if add_phoneme_timestamps and chunk.phoneme_timestamps is not None:
|
409
|
+
if chunk.phoneme_timestamps is not None:
|
410
|
+
phonemes.extend(chunk.phoneme_timestamps.phonemes)
|
411
|
+
phoneme_start.extend(chunk.phoneme_timestamps.start)
|
412
|
+
phoneme_end.extend(chunk.phoneme_timestamps.end)
|
381
413
|
|
382
414
|
return WebSocketTtsOutput(
|
383
415
|
audio=b"".join(chunks), # type: ignore
|
@@ -391,6 +423,15 @@ class TtsWebsocket:
|
|
391
423
|
if add_timestamps
|
392
424
|
else None
|
393
425
|
),
|
426
|
+
phoneme_timestamps=(
|
427
|
+
PhonemeTimestamps(
|
428
|
+
phonemes=phonemes,
|
429
|
+
start=phoneme_start,
|
430
|
+
end=phoneme_end,
|
431
|
+
)
|
432
|
+
if add_phoneme_timestamps
|
433
|
+
else None
|
434
|
+
),
|
394
435
|
)
|
395
436
|
|
396
437
|
def _websocket_generator(self, request_body: Dict[str, Any]):
|
cartesia/tts/client.py
CHANGED
@@ -67,7 +67,7 @@ class TtsClient:
|
|
67
67
|
api_key="YOUR_API_KEY",
|
68
68
|
)
|
69
69
|
client.tts.bytes(
|
70
|
-
model_id="sonic",
|
70
|
+
model_id="sonic-2",
|
71
71
|
transcript="Hello, world!",
|
72
72
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
73
73
|
language="en",
|
@@ -152,7 +152,7 @@ class TtsClient:
|
|
152
152
|
api_key="YOUR_API_KEY",
|
153
153
|
)
|
154
154
|
response = client.tts.sse(
|
155
|
-
model_id="sonic",
|
155
|
+
model_id="sonic-2",
|
156
156
|
transcript="Hello, world!",
|
157
157
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
158
158
|
language="en",
|
@@ -258,7 +258,7 @@ class AsyncTtsClient:
|
|
258
258
|
|
259
259
|
async def main() -> None:
|
260
260
|
await client.tts.bytes(
|
261
|
-
model_id="sonic",
|
261
|
+
model_id="sonic-2",
|
262
262
|
transcript="Hello, world!",
|
263
263
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
264
264
|
language="en",
|
@@ -351,7 +351,7 @@ class AsyncTtsClient:
|
|
351
351
|
|
352
352
|
async def main() -> None:
|
353
353
|
response = await client.tts.sse(
|
354
|
-
model_id="sonic",
|
354
|
+
model_id="sonic-2",
|
355
355
|
transcript="Hello, world!",
|
356
356
|
voice={"mode": "id", "id": "694f9389-aac1-45b6-b726-9d9369183238"},
|
357
357
|
language="en",
|
@@ -1,8 +1,11 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
3
|
from .web_socket_base_response import WebSocketBaseResponseParams
|
4
|
+
import typing_extensions
|
5
|
+
from ..types.flush_id import FlushId
|
4
6
|
|
5
7
|
|
6
8
|
class WebSocketChunkResponseParams(WebSocketBaseResponseParams):
|
7
9
|
data: str
|
8
10
|
step_time: float
|
11
|
+
flush_id: typing_extensions.NotRequired[FlushId]
|
@@ -4,8 +4,8 @@ from __future__ import annotations
|
|
4
4
|
import typing_extensions
|
5
5
|
import typing
|
6
6
|
import typing_extensions
|
7
|
-
from ..types.context_id import ContextId
|
8
7
|
from ..types.flush_id import FlushId
|
8
|
+
from ..types.context_id import ContextId
|
9
9
|
from .word_timestamps import WordTimestampsParams
|
10
10
|
from .phoneme_timestamps import PhonemeTimestampsParams
|
11
11
|
|
@@ -14,6 +14,7 @@ class WebSocketResponse_ChunkParams(typing_extensions.TypedDict):
|
|
14
14
|
type: typing.Literal["chunk"]
|
15
15
|
data: str
|
16
16
|
step_time: float
|
17
|
+
flush_id: typing_extensions.NotRequired[FlushId]
|
17
18
|
context_id: typing_extensions.NotRequired[ContextId]
|
18
19
|
status_code: int
|
19
20
|
done: bool
|
@@ -19,6 +19,7 @@ class WebSocketTtsRequestParams(typing_extensions.TypedDict):
|
|
19
19
|
duration: typing_extensions.NotRequired[int]
|
20
20
|
language: typing_extensions.NotRequired[str]
|
21
21
|
add_timestamps: typing_extensions.NotRequired[bool]
|
22
|
+
use_original_timestamps: typing_extensions.NotRequired[bool]
|
22
23
|
add_phoneme_timestamps: typing_extensions.NotRequired[bool]
|
23
24
|
continue_: typing_extensions.NotRequired[typing_extensions.Annotated[bool, FieldMetadata(alias="continue")]]
|
24
25
|
context_id: typing_extensions.NotRequired[str]
|
cartesia/tts/types/emotion.py
CHANGED
@@ -6,22 +6,27 @@ Emotion = typing.Union[
|
|
6
6
|
typing.Literal[
|
7
7
|
"anger:lowest",
|
8
8
|
"anger:low",
|
9
|
+
"anger",
|
9
10
|
"anger:high",
|
10
11
|
"anger:highest",
|
11
12
|
"positivity:lowest",
|
12
13
|
"positivity:low",
|
14
|
+
"positivity",
|
13
15
|
"positivity:high",
|
14
16
|
"positivity:highest",
|
15
17
|
"surprise:lowest",
|
16
18
|
"surprise:low",
|
19
|
+
"surprise",
|
17
20
|
"surprise:high",
|
18
21
|
"surprise:highest",
|
19
22
|
"sadness:lowest",
|
20
23
|
"sadness:low",
|
24
|
+
"sadness",
|
21
25
|
"sadness:high",
|
22
26
|
"sadness:highest",
|
23
27
|
"curiosity:lowest",
|
24
28
|
"curiosity:low",
|
29
|
+
"curiosity",
|
25
30
|
"curiosity:high",
|
26
31
|
"curiosity:highest",
|
27
32
|
],
|
@@ -56,6 +56,11 @@ class GenerationRequest(UniversalBaseModel):
|
|
56
56
|
Whether to return phoneme-level timestamps.
|
57
57
|
"""
|
58
58
|
|
59
|
+
use_original_timestamps: typing.Optional[bool] = pydantic.Field(default=None)
|
60
|
+
"""
|
61
|
+
Whether to use the original transcript for timestamps.
|
62
|
+
"""
|
63
|
+
|
59
64
|
if IS_PYDANTIC_V2:
|
60
65
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
61
66
|
else:
|
@@ -1,14 +1,16 @@
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
2
2
|
|
3
3
|
from .web_socket_base_response import WebSocketBaseResponse
|
4
|
-
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
5
4
|
import typing
|
5
|
+
from .flush_id import FlushId
|
6
|
+
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
6
7
|
import pydantic
|
7
8
|
|
8
9
|
|
9
10
|
class WebSocketChunkResponse(WebSocketBaseResponse):
|
10
11
|
data: str
|
11
12
|
step_time: float
|
13
|
+
flush_id: typing.Optional[FlushId] = None
|
12
14
|
|
13
15
|
if IS_PYDANTIC_V2:
|
14
16
|
model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
|
@@ -3,10 +3,10 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
from ...core.pydantic_utilities import UniversalBaseModel
|
5
5
|
import typing
|
6
|
+
from .flush_id import FlushId
|
6
7
|
from .context_id import ContextId
|
7
8
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2
|
8
9
|
import pydantic
|
9
|
-
from .flush_id import FlushId
|
10
10
|
from .word_timestamps import WordTimestamps
|
11
11
|
from .phoneme_timestamps import PhonemeTimestamps
|
12
12
|
|
@@ -15,6 +15,7 @@ class WebSocketResponse_Chunk(UniversalBaseModel):
|
|
15
15
|
type: typing.Literal["chunk"] = "chunk"
|
16
16
|
data: str
|
17
17
|
step_time: float
|
18
|
+
flush_id: typing.Optional[FlushId] = None
|
18
19
|
context_id: typing.Optional[ContextId] = None
|
19
20
|
status_code: int
|
20
21
|
done: bool
|
@@ -7,11 +7,13 @@ import pydantic
|
|
7
7
|
from ...core.pydantic_utilities import IS_PYDANTIC_V2, UniversalBaseModel
|
8
8
|
from .context_id import ContextId
|
9
9
|
from .flush_id import FlushId
|
10
|
+
from .phoneme_timestamps import PhonemeTimestamps
|
10
11
|
from .word_timestamps import WordTimestamps
|
11
12
|
|
12
13
|
|
13
14
|
class WebSocketTtsOutput(UniversalBaseModel):
|
14
15
|
word_timestamps: typing.Optional[WordTimestamps] = None
|
16
|
+
phoneme_timestamps: typing.Optional[PhonemeTimestamps] = None
|
15
17
|
audio: typing.Optional[bytes] = None
|
16
18
|
context_id: typing.Optional[ContextId] = None
|
17
19
|
flush_id: typing.Optional[FlushId] = None
|
@@ -22,6 +22,7 @@ class WebSocketTtsRequest(UniversalBaseModel):
|
|
22
22
|
duration: typing.Optional[int] = None
|
23
23
|
language: typing.Optional[str] = None
|
24
24
|
add_timestamps: typing.Optional[bool] = None
|
25
|
+
use_original_timestamps: typing.Optional[bool] = None
|
25
26
|
add_phoneme_timestamps: typing.Optional[bool] = None
|
26
27
|
continue_: typing_extensions.Annotated[typing.Optional[bool], FieldMetadata(alias="continue")] = None
|
27
28
|
context_id: typing.Optional[str] = None
|