cartesia 1.0.5__py2.py3-none-any.whl → 1.0.6__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/_types.py +24 -0
- cartesia/client.py +93 -73
- cartesia/version.py +1 -1
- {cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/METADATA +1 -1
- cartesia-1.0.6.dist-info/RECORD +12 -0
- cartesia-1.0.5.dist-info/RECORD +0 -12
- {cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/LICENSE.md +0 -0
- {cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/WHEEL +0 -0
- {cartesia-1.0.5.dist-info → cartesia-1.0.6.dist-info}/top_level.txt +0 -0
cartesia/_types.py
CHANGED
@@ -70,7 +70,31 @@ class VoiceMetadata(TypedDict):
|
|
70
70
|
language: str
|
71
71
|
|
72
72
|
|
73
|
+
class VoiceControls(TypedDict, total=False):
    """Defines different voice control parameters for voice synthesis.

    Every key is optional (``total=False``): supply only the controls you want
    to set. TypedDict fields cannot carry default values, so none are declared.

    For a complete list of supported parameters, refer to the Cartesia API documentation.
    https://docs.cartesia.ai/getting-started/welcome

    Examples:
        >>> {"speed": "fastest"}
        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
        >>> {"emotion": "surprise:high, positivity:high"}

    Note:
        This is an experimental class and is subject to rapid change in future versions.
    """

    # Speed control, e.g. "fastest" or "slow" (see the examples above).
    speed: str
    # Emotion control written as comma-separated "name:level" pairs,
    # e.g. "anger:high, positivity:low" (see the examples above).
    emotion: str
|
89
|
+
|
90
|
+
|
73
91
|
class OutputFormat(TypedDict):
|
74
92
|
container: str
|
75
93
|
encoding: str
|
76
94
|
sample_rate: int
|
95
|
+
|
96
|
+
|
97
|
+
class EventType:
    """String tags compared against the ``type`` field of a WebSocket response."""

    # Empty tag.
    NULL = ""
    # Tag of responses whose ``data`` field holds base64-encoded audio.
    AUDIO = "chunk"
    # Tag of responses that carry a ``word_timestamps`` field.
    TIMESTAMPS = "timestamps"
|
cartesia/client.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import base64
|
3
|
+
from collections import defaultdict
|
3
4
|
import json
|
4
5
|
import os
|
5
6
|
import uuid
|
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator
|
|
27
28
|
|
28
29
|
from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
|
29
30
|
from cartesia._types import (
|
31
|
+
EventType,
|
30
32
|
OutputFormat,
|
31
33
|
OutputFormatMapping,
|
32
34
|
DeprecatedOutputFormatMapping,
|
35
|
+
VoiceControls,
|
33
36
|
VoiceMetadata,
|
34
37
|
)
|
35
38
|
|
@@ -295,6 +298,7 @@ class _TTSContext:
|
|
295
298
|
context_id: Optional[str] = None,
|
296
299
|
duration: Optional[int] = None,
|
297
300
|
language: Optional[str] = None,
|
301
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
298
302
|
) -> Generator[bytes, None, None]:
|
299
303
|
"""Send audio generation requests to the WebSocket and yield responses.
|
300
304
|
|
@@ -307,6 +311,8 @@ class _TTSContext:
|
|
307
311
|
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
308
312
|
duration: The duration of the audio in seconds.
|
309
313
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
314
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
315
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
310
316
|
|
311
317
|
Yields:
|
312
318
|
Dictionary containing the following key(s):
|
@@ -322,7 +328,7 @@ class _TTSContext:
|
|
322
328
|
|
323
329
|
self._websocket.connect()
|
324
330
|
|
325
|
-
voice =
|
331
|
+
voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
|
326
332
|
|
327
333
|
# Create the initial request body
|
328
334
|
request_body = {
|
@@ -482,42 +488,16 @@ class _WebSocket:
|
|
482
488
|
def _convert_response(
|
483
489
|
self, response: Dict[str, any], include_context_id: bool
|
484
490
|
) -> Dict[str, Any]:
|
485
|
-
|
486
|
-
|
487
|
-
|
491
|
+
out = {}
|
492
|
+
if response["type"] == EventType.AUDIO:
|
493
|
+
out["audio"] = base64.b64decode(response["data"])
|
494
|
+
elif response["type"] == EventType.TIMESTAMPS:
|
495
|
+
out["word_timestamps"] = response["word_timestamps"]
|
496
|
+
|
488
497
|
if include_context_id:
|
489
|
-
|
490
|
-
|
491
|
-
return {
|
492
|
-
"audio": audio,
|
493
|
-
**optional_kwargs,
|
494
|
-
}
|
495
|
-
|
496
|
-
def _validate_and_construct_voice(
|
497
|
-
self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
|
498
|
-
) -> dict:
|
499
|
-
"""Validate and construct the voice dictionary for the request.
|
500
|
-
|
501
|
-
Args:
|
502
|
-
voice_id: The ID of the voice to use for generating audio.
|
503
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
504
|
-
|
505
|
-
Returns:
|
506
|
-
A dictionary representing the voice configuration.
|
498
|
+
out["context_id"] = response["context_id"]
|
507
499
|
|
508
|
-
|
509
|
-
ValueError: If neither or both voice_id and voice_embedding are specified.
|
510
|
-
"""
|
511
|
-
if voice_id is None and voice_embedding is None:
|
512
|
-
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
513
|
-
|
514
|
-
if voice_id is not None and voice_embedding is not None:
|
515
|
-
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
516
|
-
|
517
|
-
if voice_id:
|
518
|
-
return {"mode": "id", "id": voice_id}
|
519
|
-
|
520
|
-
return {"mode": "embedding", "embedding": voice_embedding}
|
500
|
+
return out
|
521
501
|
|
522
502
|
def send(
|
523
503
|
self,
|
@@ -530,6 +510,8 @@ class _WebSocket:
|
|
530
510
|
duration: Optional[int] = None,
|
531
511
|
language: Optional[str] = None,
|
532
512
|
stream: bool = True,
|
513
|
+
add_timestamps: bool = False,
|
514
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
533
515
|
) -> Union[bytes, Generator[bytes, None, None]]:
|
534
516
|
"""Send a request to the WebSocket to generate audio.
|
535
517
|
|
@@ -543,6 +525,9 @@ class _WebSocket:
|
|
543
525
|
duration: The duration of the audio in seconds.
|
544
526
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
545
527
|
stream: Whether to stream the audio or not.
|
528
|
+
add_timestamps: Whether to return word-level timestamps.
|
529
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
530
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
546
531
|
|
547
532
|
Returns:
|
548
533
|
If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
|
@@ -556,7 +541,7 @@ class _WebSocket:
|
|
556
541
|
if context_id is None:
|
557
542
|
context_id = str(uuid.uuid4())
|
558
543
|
|
559
|
-
voice =
|
544
|
+
voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
|
560
545
|
|
561
546
|
request_body = {
|
562
547
|
"model_id": model_id,
|
@@ -569,6 +554,7 @@ class _WebSocket:
|
|
569
554
|
},
|
570
555
|
"context_id": context_id,
|
571
556
|
"language": language,
|
557
|
+
"add_timestamps": add_timestamps,
|
572
558
|
}
|
573
559
|
|
574
560
|
if duration is not None:
|
@@ -580,10 +566,17 @@ class _WebSocket:
|
|
580
566
|
return generator
|
581
567
|
|
582
568
|
chunks = []
|
569
|
+
word_timestamps = defaultdict(list)
|
583
570
|
for chunk in generator:
|
584
|
-
|
585
|
-
|
586
|
-
|
571
|
+
if "audio" in chunk:
|
572
|
+
chunks.append(chunk["audio"])
|
573
|
+
if add_timestamps and "word_timestamps" in chunk:
|
574
|
+
for k, v in chunk["word_timestamps"].items():
|
575
|
+
word_timestamps[k].extend(v)
|
576
|
+
out = {"audio": b"".join(chunks), "context_id": context_id}
|
577
|
+
if add_timestamps:
|
578
|
+
out["word_timestamps"] = word_timestamps
|
579
|
+
return out
|
587
580
|
|
588
581
|
def _websocket_generator(self, request_body: Dict[str, Any]):
|
589
582
|
self.websocket.send(json.dumps(request_body))
|
@@ -656,32 +649,6 @@ class _SSE:
|
|
656
649
|
break
|
657
650
|
return buffer, outputs
|
658
651
|
|
659
|
-
def _validate_and_construct_voice(
|
660
|
-
self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
|
661
|
-
) -> dict:
|
662
|
-
"""Validate and construct the voice dictionary for the request.
|
663
|
-
|
664
|
-
Args:
|
665
|
-
voice_id: The ID of the voice to use for generating audio.
|
666
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
667
|
-
|
668
|
-
Returns:
|
669
|
-
A dictionary representing the voice configuration.
|
670
|
-
|
671
|
-
Raises:
|
672
|
-
ValueError: If neither or both voice_id and voice_embedding are specified.
|
673
|
-
"""
|
674
|
-
if voice_id is None and voice_embedding is None:
|
675
|
-
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
676
|
-
|
677
|
-
if voice_id is not None and voice_embedding is not None:
|
678
|
-
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
679
|
-
|
680
|
-
if voice_id:
|
681
|
-
return {"mode": "id", "id": voice_id}
|
682
|
-
|
683
|
-
return {"mode": "embedding", "embedding": voice_embedding}
|
684
|
-
|
685
652
|
def send(
|
686
653
|
self,
|
687
654
|
model_id: str,
|
@@ -692,6 +659,7 @@ class _SSE:
|
|
692
659
|
duration: Optional[int] = None,
|
693
660
|
language: Optional[str] = None,
|
694
661
|
stream: bool = True,
|
662
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
695
663
|
) -> Union[bytes, Generator[bytes, None, None]]:
|
696
664
|
"""Send a request to the server to generate audio using Server-Sent Events.
|
697
665
|
|
@@ -704,6 +672,8 @@ class _SSE:
|
|
704
672
|
duration: The duration of the audio in seconds.
|
705
673
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
706
674
|
stream: Whether to stream the audio or not.
|
675
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
676
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
707
677
|
|
708
678
|
Returns:
|
709
679
|
If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
|
@@ -711,8 +681,7 @@ class _SSE:
|
|
711
681
|
Both the generator and the dictionary contain the following key(s):
|
712
682
|
- audio: The audio as bytes.
|
713
683
|
"""
|
714
|
-
voice =
|
715
|
-
|
684
|
+
voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
|
716
685
|
request_body = {
|
717
686
|
"model_id": model_id,
|
718
687
|
"transcript": transcript,
|
@@ -946,8 +915,9 @@ class _AsyncSSE(_SSE):
|
|
946
915
|
duration: Optional[int] = None,
|
947
916
|
language: Optional[str] = None,
|
948
917
|
stream: bool = True,
|
918
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
949
919
|
) -> Union[bytes, AsyncGenerator[bytes, None]]:
|
950
|
-
voice =
|
920
|
+
voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)
|
951
921
|
|
952
922
|
request_body = {
|
953
923
|
"model_id": model_id,
|
@@ -1043,6 +1013,8 @@ class _AsyncTTSContext:
|
|
1043
1013
|
continue_: bool = False,
|
1044
1014
|
duration: Optional[int] = None,
|
1045
1015
|
language: Optional[str] = None,
|
1016
|
+
add_timestamps: bool = False,
|
1017
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
1046
1018
|
) -> None:
|
1047
1019
|
"""Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
|
1048
1020
|
|
@@ -1055,7 +1027,10 @@ class _AsyncTTSContext:
|
|
1055
1027
|
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
1056
1028
|
continue_: Whether to continue the audio generation from the previous transcript or not.
|
1057
1029
|
duration: The duration of the audio in seconds.
|
1058
|
-
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual
|
1030
|
+
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
|
1031
|
+
add_timestamps: Whether to return word-level timestamps.
|
1032
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
1033
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
1059
1034
|
|
1060
1035
|
Returns:
|
1061
1036
|
None.
|
@@ -1067,7 +1042,7 @@ class _AsyncTTSContext:
|
|
1067
1042
|
|
1068
1043
|
await self._websocket.connect()
|
1069
1044
|
|
1070
|
-
voice =
|
1045
|
+
voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
|
1071
1046
|
|
1072
1047
|
request_body = {
|
1073
1048
|
"model_id": model_id,
|
@@ -1081,6 +1056,7 @@ class _AsyncTTSContext:
|
|
1081
1056
|
"context_id": self._context_id,
|
1082
1057
|
"continue": continue_,
|
1083
1058
|
"language": language,
|
1059
|
+
"add_timestamps": add_timestamps,
|
1084
1060
|
}
|
1085
1061
|
|
1086
1062
|
if duration is not None:
|
@@ -1234,7 +1210,10 @@ class _AsyncWebSocket(_WebSocket):
|
|
1234
1210
|
duration: Optional[int] = None,
|
1235
1211
|
language: Optional[str] = None,
|
1236
1212
|
stream: bool = True,
|
1213
|
+
add_timestamps: bool = False,
|
1214
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
1237
1215
|
) -> Union[bytes, AsyncGenerator[bytes, None]]:
|
1216
|
+
"""See :meth:`_WebSocket.send` for details."""
|
1238
1217
|
if context_id is None:
|
1239
1218
|
context_id = str(uuid.uuid4())
|
1240
1219
|
|
@@ -1250,6 +1229,8 @@ class _AsyncWebSocket(_WebSocket):
|
|
1250
1229
|
duration=duration,
|
1251
1230
|
language=language,
|
1252
1231
|
continue_=False,
|
1232
|
+
add_timestamps = add_timestamps,
|
1233
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
1253
1234
|
)
|
1254
1235
|
|
1255
1236
|
generator = ctx.receive()
|
@@ -1258,10 +1239,17 @@ class _AsyncWebSocket(_WebSocket):
|
|
1258
1239
|
return generator
|
1259
1240
|
|
1260
1241
|
chunks = []
|
1242
|
+
word_timestamps = defaultdict(list)
|
1261
1243
|
async for chunk in generator:
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1244
|
+
if "audio" in chunk:
|
1245
|
+
chunks.append(chunk["audio"])
|
1246
|
+
if add_timestamps and "word_timestamps" in chunk:
|
1247
|
+
for k, v in chunk["word_timestamps"].items():
|
1248
|
+
word_timestamps[k].extend(v)
|
1249
|
+
out = {"audio": b"".join(chunks), "context_id": context_id}
|
1250
|
+
if add_timestamps:
|
1251
|
+
out["word_timestamps"] = word_timestamps
|
1252
|
+
return out
|
1265
1253
|
|
1266
1254
|
async def _process_responses(self):
|
1267
1255
|
try:
|
@@ -1311,3 +1299,35 @@ class AsyncTTS(TTS):
|
|
1311
1299
|
)
|
1312
1300
|
await ws.connect()
|
1313
1301
|
return ws
|
1302
|
+
|
1303
|
+
|
1304
|
+
def _validate_and_construct_voice(
|
1305
|
+
voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
|
1306
|
+
) -> dict:
|
1307
|
+
"""Validate and construct the voice dictionary for the request.
|
1308
|
+
|
1309
|
+
Args:
|
1310
|
+
voice_id: The ID of the voice to use for generating audio.
|
1311
|
+
voice_embedding: The embedding of the voice to use for generating audio.
|
1312
|
+
experimental_voice_controls: Voice controls for emotion and speed.
|
1313
|
+
Note: This is an experimental feature and may rapidly change in the future.
|
1314
|
+
|
1315
|
+
Returns:
|
1316
|
+
A dictionary representing the voice configuration.
|
1317
|
+
|
1318
|
+
Raises:
|
1319
|
+
ValueError: If neither or both voice_id and voice_embedding are specified.
|
1320
|
+
"""
|
1321
|
+
if voice_id is None and voice_embedding is None:
|
1322
|
+
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
1323
|
+
|
1324
|
+
if voice_id is not None and voice_embedding is not None:
|
1325
|
+
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
1326
|
+
|
1327
|
+
if voice_id:
|
1328
|
+
voice = {"mode": "id", "id": voice_id}
|
1329
|
+
else:
|
1330
|
+
voice = {"mode": "embedding", "embedding": voice_embedding}
|
1331
|
+
if experimental_voice_controls is not None:
|
1332
|
+
voice["__experimental_controls"] = experimental_voice_controls
|
1333
|
+
return voice
|
cartesia/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.0.5"
|
1
|
+
__version__ = "1.0.6"
|
@@ -0,0 +1,12 @@
|
|
1
|
+
cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
|
2
|
+
cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
|
3
|
+
cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
|
4
|
+
cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
|
5
|
+
cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
|
7
|
+
cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
|
8
|
+
cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
|
9
|
+
cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
|
10
|
+
cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
|
11
|
+
cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
12
|
+
cartesia-1.0.6.dist-info/RECORD,,
|
cartesia-1.0.5.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
|
2
|
-
cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
|
3
|
-
cartesia/client.py,sha256=46XiKTXa0gBXJ_GftMtLHAzBoX0GmWz_aWYuG68jaNQ,49316
|
4
|
-
cartesia/version.py,sha256=B9kKWJLln1i8LjtkcYecvNWGLTrez4gCUOHtnPlInFo,22
|
5
|
-
cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
|
7
|
-
cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
|
8
|
-
cartesia-1.0.5.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
|
9
|
-
cartesia-1.0.5.dist-info/METADATA,sha256=PImHYCNoo7iSnm3Br6PuRdqvli92c7AyXR4iagdv-d8,18368
|
10
|
-
cartesia-1.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
|
11
|
-
cartesia-1.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
12
|
-
cartesia-1.0.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|