cartesia 1.0.5.tar.gz → 1.0.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cartesia-1.0.5 → cartesia-1.0.6}/PKG-INFO +1 -1
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia/_types.py +24 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia/client.py +93 -73
- cartesia-1.0.6/cartesia/version.py +1 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia.egg-info/PKG-INFO +1 -1
- {cartesia-1.0.5 → cartesia-1.0.6}/tests/test_tts.py +97 -14
- cartesia-1.0.5/cartesia/version.py +0 -1
- {cartesia-1.0.5 → cartesia-1.0.6}/LICENSE.md +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/README.md +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia/__init__.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia/utils/__init__.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia/utils/deprecated.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia/utils/retry.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia.egg-info/SOURCES.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia.egg-info/dependency_links.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia.egg-info/requires.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/cartesia.egg-info/top_level.txt +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/pyproject.toml +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/setup.cfg +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/setup.py +0 -0
- {cartesia-1.0.5 → cartesia-1.0.6}/tests/test_deprecated.py +0 -0
--- cartesia-1.0.5/cartesia/_types.py
+++ cartesia-1.0.6/cartesia/_types.py
@@ -70,7 +70,31 @@ class VoiceMetadata(TypedDict):
     language: str


+class VoiceControls(TypedDict):
+    """Defines different voice control parameters for voice synthesis.
+
+    For a complete list of supported parameters, refer to the Cartesia API documentation.
+    https://docs.cartesia.ai/getting-started/welcome
+
+    Examples:
+        >>> {"speed": "fastest"}
+        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
+        >>> {"emotion": "surprise:high, positivity:high"}
+
+    Note:
+        This is an experimental class and is subject to rapid change in future versions.
+    """
+    speed: str = ""
+    emotion: str = ""
+
+
 class OutputFormat(TypedDict):
     container: str
     encoding: str
     sample_rate: int
+
+
+class EventType:
+    NULL = ""
+    AUDIO = "chunk"
+    TIMESTAMPS = "timestamps"
--- cartesia-1.0.5/cartesia/client.py
+++ cartesia-1.0.6/cartesia/client.py
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+from collections import defaultdict
 import json
 import os
 import uuid
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator

 from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
 from cartesia._types import (
+    EventType,
     OutputFormat,
     OutputFormatMapping,
     DeprecatedOutputFormatMapping,
+    VoiceControls,
     VoiceMetadata,
 )

@@ -295,6 +298,7 @@ class _TTSContext:
         context_id: Optional[str] = None,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Generator[bytes, None, None]:
         """Send audio generation requests to the WebSocket and yield responses.

@@ -307,6 +311,8 @@ class _TTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.

         Yields:
             Dictionary containing the following key(s):
@@ -322,7 +328,7 @@ class _TTSContext:

         self._websocket.connect()

-        voice =
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)

         # Create the initial request body
         request_body = {
@@ -482,42 +488,16 @@ class _WebSocket:
     def _convert_response(
         self, response: Dict[str, any], include_context_id: bool
     ) -> Dict[str, Any]:
-
-
-
+        out = {}
+        if response["type"] == EventType.AUDIO:
+            out["audio"] = base64.b64decode(response["data"])
+        elif response["type"] == EventType.TIMESTAMPS:
+            out["word_timestamps"] = response["word_timestamps"]
+
         if include_context_id:
-
-
-        return {
-            "audio": audio,
-            **optional_kwargs,
-        }
-
-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-
-        Returns:
-            A dictionary representing the voice configuration.
+            out["context_id"] = response["context_id"]

-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-
-        return {"mode": "embedding", "embedding": voice_embedding}
+        return out

     def send(
         self,
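Note on the rewritten `_convert_response`: it now dispatches on the new `EventType` values rather than assuming every message is audio. A minimal sketch of the mapping, assuming server messages shaped the way the handler above implies (values illustrative):

import base64

# Hypothetical incoming WebSocket messages; the field names are inferred from the handler above.
audio_msg = {"type": "chunk", "data": base64.b64encode(b"\x00\x01").decode(), "context_id": "ctx-1"}
timestamps_msg = {
    "type": "timestamps",
    "word_timestamps": {"words": ["Hi"], "start": [0.0], "end": [0.2]},
    "context_id": "ctx-1",
}
# _convert_response(audio_msg, include_context_id=True)
#   -> {"audio": b"\x00\x01", "context_id": "ctx-1"}
# _convert_response(timestamps_msg, include_context_id=True)
#   -> {"word_timestamps": {"words": ["Hi"], "start": [0.0], "end": [0.2]}, "context_id": "ctx-1"}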
@@ -530,6 +510,8 @@ class _WebSocket:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the WebSocket to generate audio.

@@ -543,6 +525,9 @@
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.

         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -556,7 +541,7 @@
         if context_id is None:
             context_id = str(uuid.uuid4())

-        voice =
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)

         request_body = {
             "model_id": model_id,
@@ -569,6 +554,7 @@
             },
             "context_id": context_id,
             "language": language,
+            "add_timestamps": add_timestamps,
         }

         if duration is not None:
@@ -580,10 +566,17 @@
             return generator

         chunks = []
+        word_timestamps = defaultdict(list)
         for chunk in generator:
-
-
-
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out

     def _websocket_generator(self, request_body: Dict[str, Any]):
         self.websocket.send(json.dumps(request_body))
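With `add_timestamps=True` and `stream=False`, `_WebSocket.send` now returns one merged result: audio chunks are joined and per-chunk word timestamps are accumulated into a single mapping. A usage sketch, assuming the top-level `Cartesia(api_key=...)` client construction; the API key and voice ID are placeholders:

from cartesia import Cartesia
from cartesia.client import DEFAULT_MODEL_ID

client = Cartesia(api_key="YOUR_API_KEY")  # placeholder key, assumed constructor
ws = client.tts.websocket()
out = ws.send(
    model_id=DEFAULT_MODEL_ID,
    transcript="Hello, world!",
    voice_id="YOUR_VOICE_ID",  # placeholder voice
    output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
    stream=False,
    add_timestamps=True,
)
# out["audio"] holds the concatenated audio bytes; out["word_timestamps"] holds the
# accumulated "words"/"start"/"end" lists.
ws.close()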
@@ -656,32 +649,6 @@ class _SSE:
                 break
         return buffer, outputs

-    def _validate_and_construct_voice(
-        self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-    ) -> dict:
-        """Validate and construct the voice dictionary for the request.
-
-        Args:
-            voice_id: The ID of the voice to use for generating audio.
-            voice_embedding: The embedding of the voice to use for generating audio.
-
-        Returns:
-            A dictionary representing the voice configuration.
-
-        Raises:
-            ValueError: If neither or both voice_id and voice_embedding are specified.
-        """
-        if voice_id is None and voice_embedding is None:
-            raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-        if voice_id is not None and voice_embedding is not None:
-            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-        if voice_id:
-            return {"mode": "id", "id": voice_id}
-
-        return {"mode": "embedding", "embedding": voice_embedding}
-
     def send(
         self,
         model_id: str,
@@ -692,6 +659,7 @@ class _SSE:
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, Generator[bytes, None, None]]:
         """Send a request to the server to generate audio using Server-Sent Events.

@@ -704,6 +672,8 @@ class _SSE:
             duration: The duration of the audio in seconds.
             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
             stream: Whether to stream the audio or not.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.

         Returns:
             If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -711,8 +681,7 @@ class _SSE:
         Both the generator and the dictionary contain the following key(s):
         - audio: The audio as bytes.
         """
-        voice =
-
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
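The `_experimental_voice_controls` mapping is also accepted on the SSE path and forwarded into the voice payload. A sketch of a caller passing the controls described by `VoiceControls`, assuming the `client.tts.sse` entry point (key and voice ID are placeholders):

from cartesia import Cartesia
from cartesia.client import DEFAULT_MODEL_ID

client = Cartesia(api_key="YOUR_API_KEY")  # placeholder key, assumed constructor
for chunk in client.tts.sse(
    model_id=DEFAULT_MODEL_ID,
    transcript="Hello, world!",
    voice_id="YOUR_VOICE_ID",  # placeholder voice
    output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
    stream=True,
    _experimental_voice_controls={"speed": "fastest", "emotion": ["anger:high", "positivity:low"]},
):
    audio_bytes = chunk["audio"]  # each streamed chunk carries decoded audio bytes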
@@ -946,8 +915,9 @@ class _AsyncSSE(_SSE):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice =
+        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)

         request_body = {
             "model_id": model_id,
@@ -1043,6 +1013,8 @@ class _AsyncTTSContext:
         continue_: bool = False,
         duration: Optional[int] = None,
         language: Optional[str] = None,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> None:
         """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.

@@ -1055,7 +1027,10 @@ class _AsyncTTSContext:
             context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
             continue_: Whether to continue the audio generation from the previous transcript or not.
             duration: The duration of the audio in seconds.
-            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual
+            language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+            add_timestamps: Whether to return word-level timestamps.
+            _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                Note: This is an experimental feature and may change rapidly in future releases.

         Returns:
             None.
@@ -1067,7 +1042,7 @@ class _AsyncTTSContext:

         await self._websocket.connect()

-        voice =
+        voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)

         request_body = {
             "model_id": model_id,
@@ -1081,6 +1056,7 @@ class _AsyncTTSContext:
             "context_id": self._context_id,
             "continue": continue_,
             "language": language,
+            "add_timestamps": add_timestamps,
         }

         if duration is not None:
@@ -1234,7 +1210,10 @@ class _AsyncWebSocket(_WebSocket):
         duration: Optional[int] = None,
         language: Optional[str] = None,
         stream: bool = True,
+        add_timestamps: bool = False,
+        _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+        """See :meth:`_WebSocket.send` for details."""
         if context_id is None:
             context_id = str(uuid.uuid4())

@@ -1250,6 +1229,8 @@ class _AsyncWebSocket(_WebSocket):
             duration=duration,
             language=language,
             continue_=False,
+            add_timestamps = add_timestamps,
+            _experimental_voice_controls=_experimental_voice_controls,
         )

         generator = ctx.receive()
@@ -1258,10 +1239,17 @@ class _AsyncWebSocket(_WebSocket):
             return generator

         chunks = []
+        word_timestamps = defaultdict(list)
         async for chunk in generator:
-
-
-
+            if "audio" in chunk:
+                chunks.append(chunk["audio"])
+            if add_timestamps and "word_timestamps" in chunk:
+                for k, v in chunk["word_timestamps"].items():
+                    word_timestamps[k].extend(v)
+        out = {"audio": b"".join(chunks), "context_id": context_id}
+        if add_timestamps:
+            out["word_timestamps"] = word_timestamps
+        return out

     async def _process_responses(self):
         try:
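The async WebSocket path mirrors the synchronous aggregation above. A minimal async sketch under the same assumptions (placeholder key and voice ID):

import asyncio
from cartesia import AsyncCartesia
from cartesia.client import DEFAULT_MODEL_ID

async def synthesize_with_timestamps():
    client = AsyncCartesia(api_key="YOUR_API_KEY")  # placeholder key, assumed constructor
    ws = await client.tts.websocket()
    try:
        out = await ws.send(
            model_id=DEFAULT_MODEL_ID,
            transcript="Hello, world!",
            voice_id="YOUR_VOICE_ID",  # placeholder voice
            output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
            stream=False,
            add_timestamps=True,
        )
        return out["audio"], out["word_timestamps"]
    finally:
        await ws.close()
        await client.close()

# asyncio.run(synthesize_with_timestamps())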
@@ -1311,3 +1299,35 @@ class AsyncTTS(TTS):
         )
         await ws.connect()
         return ws
+
+
+def _validate_and_construct_voice(
+    voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
+) -> dict:
+    """Validate and construct the voice dictionary for the request.
+
+    Args:
+        voice_id: The ID of the voice to use for generating audio.
+        voice_embedding: The embedding of the voice to use for generating audio.
+        experimental_voice_controls: Voice controls for emotion and speed.
+            Note: This is an experimental feature and may rapidly change in the future.
+
+    Returns:
+        A dictionary representing the voice configuration.
+
+    Raises:
+        ValueError: If neither or both voice_id and voice_embedding are specified.
+    """
+    if voice_id is None and voice_embedding is None:
+        raise ValueError("Either voice_id or voice_embedding must be specified.")
+
+    if voice_id is not None and voice_embedding is not None:
+        raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+
+    if voice_id:
+        voice = {"mode": "id", "id": voice_id}
+    else:
+        voice = {"mode": "embedding", "embedding": voice_embedding}
+    if experimental_voice_controls is not None:
+        voice["__experimental_controls"] = experimental_voice_controls
+    return voice
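`_validate_and_construct_voice` is now a module-level helper shared by the SSE and WebSocket paths, so the experimental controls are attached to the voice payload in one place. The dictionaries it produces look like this (values illustrative):

_validate_and_construct_voice("voice-id-123")
# -> {"mode": "id", "id": "voice-id-123"}

_validate_and_construct_voice(
    "voice-id-123",
    experimental_voice_controls={"speed": "fastest", "emotion": ["anger:high", "positivity:low"]},
)
# -> {"mode": "id", "id": "voice-id-123",
#     "__experimental_controls": {"speed": "fastest", "emotion": ["anger:high", "positivity:low"]}}

_validate_and_construct_voice(voice_embedding=[0.0] * 192)
# -> {"mode": "embedding", "embedding": [0.0, 0.0, ...]}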
--- /dev/null
+++ cartesia-1.0.6/cartesia/version.py
@@ -0,0 +1 @@
+__version__ = "1.0.6"
--- cartesia-1.0.5/tests/test_tts.py
+++ cartesia-1.0.6/tests/test_tts.py
@@ -10,7 +10,7 @@ import os
 import sys
 from cartesia import AsyncCartesia, Cartesia
 from cartesia.client import DEFAULT_MODEL_ID, MULTILINGUAL_MODEL_ID
-from cartesia._types import VoiceMetadata
+from cartesia._types import VoiceControls, VoiceMetadata
 from typing import AsyncGenerator, Generator, List
 import numpy as np
 import pytest
@@ -19,9 +19,11 @@ import asyncio

 THISDIR = os.path.dirname(__file__)
 sys.path.insert(0, os.path.dirname(THISDIR))
+RESOURCES_DIR = os.path.join(THISDIR, "resources")

 SAMPLE_VOICE = "Newsman"
 SAMPLE_VOICE_ID = "d46abd1d-2d02-43e8-819f-51fb652c1c61"
+EXPERIMENTAL_VOICE_CONTROLS = {"emotion": ["anger:high", "positivity:low"], "speed": "fastest"}

 logger = logging.getLogger(__name__)

@@ -84,7 +86,12 @@ def test_get_voice_from_id(client: Cartesia):
 # cloned_voice_embedding = client.voices.clone(link=url)
 # assert isinstance(cloned_voice_embedding, list)
 # assert len(cloned_voice_embedding) == 192
-
+
+def test_clone_voice_with_file(client: Cartesia):
+    logger.info("Testing voices.clone with file")
+    output = client.voices.clone(filepath=os.path.join(RESOURCES_DIR, "sample-speech-4s.wav"))
+    assert isinstance(output, list)
+
 def test_create_voice(client: Cartesia):
     logger.info("Testing voices.create")
     embedding = np.ones(192).tolist()
@@ -96,7 +103,8 @@ def test_create_voice(client: Cartesia):
     assert voice in voices

 @pytest.mark.parametrize("stream", [True, False])
-def test_sse_send(resources: _Resources, stream: bool):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+def test_sse_send(resources: _Resources, stream: bool, _experimental_voice_controls: VoiceControls):
     logger.info("Testing SSE send")
     client = resources.client
     transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -105,7 +113,7 @@ def test_sse_send(resources: _Resources, stream: bool):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=stream, model_id=DEFAULT_MODEL_ID)
+    }, stream=stream, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)

     if not stream:
         output_generate = [output_generate]
@@ -132,7 +140,8 @@ def test_sse_send_with_model_id(resources: _Resources, stream: bool):
         assert isinstance(out["audio"], bytes)

 @pytest.mark.parametrize("stream", [True, False])
-def test_websocket_send(resources: _Resources, stream: bool):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+def test_websocket_send(resources: _Resources, stream: bool, _experimental_voice_controls: VoiceControls):
     logger.info("Testing WebSocket send")
     client = resources.client
     transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -143,7 +152,7 @@ def test_websocket_send(resources: _Resources, stream: bool):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id)
+    }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id, _experimental_voice_controls=_experimental_voice_controls)

     if not stream:
         output_generate = [output_generate]
@@ -152,8 +161,37 @@ def test_websocket_send(resources: _Resources, stream: bool):
         assert isinstance(out["audio"], bytes)

     ws.close()
+
+
+@pytest.mark.parametrize("stream", [True, False])
+def test_websocket_send_timestamps(resources: _Resources, stream: bool):
+    logger.info("Testing WebSocket send")
+    client = resources.client
+    transcript = "Hello, world! I'm generating audio on Cartesia."
+
+    ws = client.tts.websocket()
+    context_id = str(uuid.uuid4())
+    output_generate = ws.send(transcript=transcript, voice_id=SAMPLE_VOICE_ID, output_format={
+        "container": "raw",
+        "encoding": "pcm_f32le",
+        "sample_rate": 44100
+    }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id, add_timestamps=True)
+
+    if not stream:
+        output_generate = [output_generate]
+
+    has_wordtimestamps = False
+    for out in output_generate:
+        has_wordtimestamps |= "word_timestamps" in out
+        _validate_schema(out)
+
+    assert has_wordtimestamps, "No word timestamps found"
+
+    ws.close()
+

-def test_sse_send_context_manager(resources: _Resources):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+def test_sse_send_context_manager(resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing SSE send context manager")
     transcript = "Hello, world! I'm generating audio on Cartesia."

@@ -162,7 +200,7 @@ def test_sse_send_context_manager(resources: _Resources):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=True, model_id=DEFAULT_MODEL_ID)
+    }, stream=True, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)
     assert isinstance(output_generate, Generator)

     for out in output_generate:
@@ -183,7 +221,7 @@ def test_sse_send_context_manager_with_err():
             raise RuntimeError("Expected error to be thrown")
     except Exception:
         pass
-
+
 def test_websocket_send_context_manager(resources: _Resources):
     logger.info("Testing WebSocket send context manager")
     transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -216,9 +254,10 @@ def test_websocket_send_context_manage_err(resources: _Resources):
             raise RuntimeError("Expected error to be thrown")
     except Exception:
         pass
-
+
 @pytest.mark.asyncio
-async def test_async_sse_send(resources: _Resources):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+async def test_async_sse_send( resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing async SSE send")
     transcript = "Hello, world! I'm generating audio on Cartesia."

@@ -228,7 +267,7 @@ async def test_async_sse_send(resources: _Resources):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100
-    }, stream=True, model_id=DEFAULT_MODEL_ID)
+    }, stream=True, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)

     async for out in output:
         assert out.keys() == {"audio"}
@@ -238,7 +277,8 @@ async def test_async_sse_send(resources: _Resources):
     await async_client.close()

 @pytest.mark.asyncio
-async def test_async_websocket_send(resources: _Resources):
+@pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+async def test_async_websocket_send(resources: _Resources, _experimental_voice_controls: VoiceControls):
     logger.info("Testing async WebSocket send")
     transcript = "Hello, world! I'm generating audio on Cartesia."

@@ -250,7 +290,7 @@ async def test_async_websocket_send(resources: _Resources):
         "container": "raw",
         "encoding": "pcm_f32le",
         "sample_rate": 44100,
-    }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id)
+    }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id, _experimental_voice_controls=_experimental_voice_controls)

     async for out in output:
         assert out.keys() == {"audio", "context_id"}
@@ -259,7 +299,37 @@ async def test_async_websocket_send(resources: _Resources):
     # Close the websocket
     await ws.close()
     await async_client.close()
+
+
+@pytest.mark.asyncio
+async def test_async_websocket_send_timestamps(resources: _Resources):
+    logger.info("Testing async WebSocket send with timestamps")
+    transcript = "Hello, world! I'm generating audio on Cartesia."
+
+    async_client = create_async_client()
+    ws = await async_client.tts.websocket()
+    context_id = str(uuid.uuid4())
+    try:
+        output = await ws.send(transcript=transcript, voice_id=SAMPLE_VOICE_ID, output_format={
+            "container": "raw",
+            "encoding": "pcm_f32le",
+            "sample_rate": 44100,
+        }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id, add_timestamps=True)
+
+        has_wordtimestamps = False
+        async for out in output:
+            assert "context_id" in out
+            has_wordtimestamps |= "word_timestamps" in out
+            _validate_schema(out)

+        assert has_wordtimestamps, "No word timestamps found"
+
+    finally:
+        # Close the websocket
+        await ws.close()
+        await async_client.close()
+
+
 @pytest.mark.asyncio
 async def test_async_sse_send_context_manager(resources: _Resources):
     logger.info("Testing async SSE send context manager")
@@ -766,3 +836,16 @@ def test_websocket_send_with_incorrect_url():
         ws.close()
     except Exception as e:
         logger.info("Unexpected error occured: ", e)
+
+
+def _validate_schema(out):
+    if "audio" in out:
+        assert isinstance(out["audio"], bytes)
+    if "word_timestamps" in out:
+        assert isinstance(out["word_timestamps"], dict)
+        word_timestamps = out["word_timestamps"]
+
+        assert word_timestamps.keys() == {"words", "start", "end"}
+        assert isinstance(word_timestamps["words"], list) and all(isinstance(word, str) for word in word_timestamps["words"])
+        assert isinstance(word_timestamps["start"], list) and all(isinstance(start, (int, float)) for start in word_timestamps["start"])
+        assert isinstance(word_timestamps["end"], list) and all(isinstance(end, (int, float)) for end in word_timestamps["end"])
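For reference, a chunk that satisfies `_validate_schema` has this shape (values illustrative; the start/end units are whatever the API reports, presumably seconds):

{
    "audio": b"...",  # raw audio bytes
    "context_id": "ctx-123",
    "word_timestamps": {
        "words": ["Hello,", "world!"],
        "start": [0.0, 0.48],
        "end": [0.45, 0.92],
    },
}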
--- cartesia-1.0.5/cartesia/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "1.0.5"