cartesia 1.0.5__py2.py3-none-any.whl → 1.0.7__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartesia/_types.py +26 -1
- cartesia/client.py +113 -72
- cartesia/version.py +1 -1
- {cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/METADATA +54 -1
- cartesia-1.0.7.dist-info/RECORD +12 -0
- cartesia-1.0.5.dist-info/RECORD +0 -12
- {cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/LICENSE.md +0 -0
- {cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/WHEEL +0 -0
- {cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/top_level.txt +0 -0
cartesia/_types.py
CHANGED
@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
|
|
45
45
|
"mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
|
46
46
|
"alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
|
47
47
|
}
|
48
|
-
|
48
|
+
|
49
49
|
@classmethod
|
50
50
|
@deprecated(
|
51
51
|
vdeprecated="1.0.1",
|
@@ -70,7 +70,32 @@ class VoiceMetadata(TypedDict):
|
|
70
70
|
language: str
|
71
71
|
|
72
72
|
|
73
|
+
class VoiceControls(TypedDict):
|
74
|
+
"""Defines different voice control parameters for voice synthesis.
|
75
|
+
|
76
|
+
For a complete list of supported parameters, refer to the Cartesia API documentation.
|
77
|
+
https://docs.cartesia.ai/api-reference
|
78
|
+
|
79
|
+
Examples:
|
80
|
+
>>> {"speed": "fastest"}
|
81
|
+
>>> {"speed": "slow", "emotion": ["sadness:high"]}
|
82
|
+
>>> {"emotion": ["surprise:highest", "curiosity"]}
|
83
|
+
|
84
|
+
Note:
|
85
|
+
This is an experimental class and is subject to rapid change in future versions.
|
86
|
+
"""
|
87
|
+
|
88
|
+
speed: str = ""
|
89
|
+
emotion: List[str] = []
|
90
|
+
|
91
|
+
|
73
92
|
class OutputFormat(TypedDict):
|
74
93
|
container: str
|
75
94
|
encoding: str
|
76
95
|
sample_rate: int
|
96
|
+
|
97
|
+
|
98
|
+
class EventType:
|
99
|
+
NULL = ""
|
100
|
+
AUDIO = "chunk"
|
101
|
+
TIMESTAMPS = "timestamps"
|
cartesia/client.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import asyncio
|
2
2
|
import base64
|
3
|
+
from collections import defaultdict
|
3
4
|
import json
|
4
5
|
import os
|
5
6
|
import uuid
|
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator
|
|
27
28
|
|
28
29
|
from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
|
29
30
|
from cartesia._types import (
|
31
|
+
EventType,
|
30
32
|
OutputFormat,
|
31
33
|
OutputFormatMapping,
|
32
34
|
DeprecatedOutputFormatMapping,
|
35
|
+
VoiceControls,
|
33
36
|
VoiceMetadata,
|
34
37
|
)
|
35
38
|
|
@@ -295,6 +298,7 @@ class _TTSContext:
|
|
295
298
|
context_id: Optional[str] = None,
|
296
299
|
duration: Optional[int] = None,
|
297
300
|
language: Optional[str] = None,
|
301
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
298
302
|
) -> Generator[bytes, None, None]:
|
299
303
|
"""Send audio generation requests to the WebSocket and yield responses.
|
300
304
|
|
@@ -307,6 +311,8 @@ class _TTSContext:
|
|
307
311
|
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
308
312
|
duration: The duration of the audio in seconds.
|
309
313
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
314
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
315
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
310
316
|
|
311
317
|
Yields:
|
312
318
|
Dictionary containing the following key(s):
|
@@ -322,7 +328,11 @@ class _TTSContext:
|
|
322
328
|
|
323
329
|
self._websocket.connect()
|
324
330
|
|
325
|
-
voice =
|
331
|
+
voice = TTS._validate_and_construct_voice(
|
332
|
+
voice_id,
|
333
|
+
voice_embedding=voice_embedding,
|
334
|
+
experimental_voice_controls=_experimental_voice_controls,
|
335
|
+
)
|
326
336
|
|
327
337
|
# Create the initial request body
|
328
338
|
request_body = {
|
@@ -482,42 +492,16 @@ class _WebSocket:
|
|
482
492
|
def _convert_response(
|
483
493
|
self, response: Dict[str, any], include_context_id: bool
|
484
494
|
) -> Dict[str, Any]:
|
485
|
-
|
495
|
+
out = {}
|
496
|
+
if response["type"] == EventType.AUDIO:
|
497
|
+
out["audio"] = base64.b64decode(response["data"])
|
498
|
+
elif response["type"] == EventType.TIMESTAMPS:
|
499
|
+
out["word_timestamps"] = response["word_timestamps"]
|
486
500
|
|
487
|
-
optional_kwargs = {}
|
488
501
|
if include_context_id:
|
489
|
-
|
490
|
-
|
491
|
-
return {
|
492
|
-
"audio": audio,
|
493
|
-
**optional_kwargs,
|
494
|
-
}
|
495
|
-
|
496
|
-
def _validate_and_construct_voice(
|
497
|
-
self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
|
498
|
-
) -> dict:
|
499
|
-
"""Validate and construct the voice dictionary for the request.
|
500
|
-
|
501
|
-
Args:
|
502
|
-
voice_id: The ID of the voice to use for generating audio.
|
503
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
502
|
+
out["context_id"] = response["context_id"]
|
504
503
|
|
505
|
-
|
506
|
-
A dictionary representing the voice configuration.
|
507
|
-
|
508
|
-
Raises:
|
509
|
-
ValueError: If neither or both voice_id and voice_embedding are specified.
|
510
|
-
"""
|
511
|
-
if voice_id is None and voice_embedding is None:
|
512
|
-
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
513
|
-
|
514
|
-
if voice_id is not None and voice_embedding is not None:
|
515
|
-
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
516
|
-
|
517
|
-
if voice_id:
|
518
|
-
return {"mode": "id", "id": voice_id}
|
519
|
-
|
520
|
-
return {"mode": "embedding", "embedding": voice_embedding}
|
504
|
+
return out
|
521
505
|
|
522
506
|
def send(
|
523
507
|
self,
|
@@ -530,6 +514,8 @@ class _WebSocket:
|
|
530
514
|
duration: Optional[int] = None,
|
531
515
|
language: Optional[str] = None,
|
532
516
|
stream: bool = True,
|
517
|
+
add_timestamps: bool = False,
|
518
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
533
519
|
) -> Union[bytes, Generator[bytes, None, None]]:
|
534
520
|
"""Send a request to the WebSocket to generate audio.
|
535
521
|
|
@@ -543,6 +529,9 @@ class _WebSocket:
|
|
543
529
|
duration: The duration of the audio in seconds.
|
544
530
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
545
531
|
stream: Whether to stream the audio or not.
|
532
|
+
add_timestamps: Whether to return word-level timestamps.
|
533
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
534
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
546
535
|
|
547
536
|
Returns:
|
548
537
|
If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
|
@@ -556,7 +545,11 @@ class _WebSocket:
|
|
556
545
|
if context_id is None:
|
557
546
|
context_id = str(uuid.uuid4())
|
558
547
|
|
559
|
-
voice =
|
548
|
+
voice = TTS._validate_and_construct_voice(
|
549
|
+
voice_id,
|
550
|
+
voice_embedding=voice_embedding,
|
551
|
+
experimental_voice_controls=_experimental_voice_controls,
|
552
|
+
)
|
560
553
|
|
561
554
|
request_body = {
|
562
555
|
"model_id": model_id,
|
@@ -569,6 +562,7 @@ class _WebSocket:
|
|
569
562
|
},
|
570
563
|
"context_id": context_id,
|
571
564
|
"language": language,
|
565
|
+
"add_timestamps": add_timestamps,
|
572
566
|
}
|
573
567
|
|
574
568
|
if duration is not None:
|
@@ -580,10 +574,17 @@ class _WebSocket:
|
|
580
574
|
return generator
|
581
575
|
|
582
576
|
chunks = []
|
577
|
+
word_timestamps = defaultdict(list)
|
583
578
|
for chunk in generator:
|
584
|
-
|
585
|
-
|
586
|
-
|
579
|
+
if "audio" in chunk:
|
580
|
+
chunks.append(chunk["audio"])
|
581
|
+
if add_timestamps and "word_timestamps" in chunk:
|
582
|
+
for k, v in chunk["word_timestamps"].items():
|
583
|
+
word_timestamps[k].extend(v)
|
584
|
+
out = {"audio": b"".join(chunks), "context_id": context_id}
|
585
|
+
if add_timestamps:
|
586
|
+
out["word_timestamps"] = word_timestamps
|
587
|
+
return out
|
587
588
|
|
588
589
|
def _websocket_generator(self, request_body: Dict[str, Any]):
|
589
590
|
self.websocket.send(json.dumps(request_body))
|
@@ -656,32 +657,6 @@ class _SSE:
|
|
656
657
|
break
|
657
658
|
return buffer, outputs
|
658
659
|
|
659
|
-
def _validate_and_construct_voice(
|
660
|
-
self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
|
661
|
-
) -> dict:
|
662
|
-
"""Validate and construct the voice dictionary for the request.
|
663
|
-
|
664
|
-
Args:
|
665
|
-
voice_id: The ID of the voice to use for generating audio.
|
666
|
-
voice_embedding: The embedding of the voice to use for generating audio.
|
667
|
-
|
668
|
-
Returns:
|
669
|
-
A dictionary representing the voice configuration.
|
670
|
-
|
671
|
-
Raises:
|
672
|
-
ValueError: If neither or both voice_id and voice_embedding are specified.
|
673
|
-
"""
|
674
|
-
if voice_id is None and voice_embedding is None:
|
675
|
-
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
676
|
-
|
677
|
-
if voice_id is not None and voice_embedding is not None:
|
678
|
-
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
679
|
-
|
680
|
-
if voice_id:
|
681
|
-
return {"mode": "id", "id": voice_id}
|
682
|
-
|
683
|
-
return {"mode": "embedding", "embedding": voice_embedding}
|
684
|
-
|
685
660
|
def send(
|
686
661
|
self,
|
687
662
|
model_id: str,
|
@@ -692,6 +667,7 @@ class _SSE:
|
|
692
667
|
duration: Optional[int] = None,
|
693
668
|
language: Optional[str] = None,
|
694
669
|
stream: bool = True,
|
670
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
695
671
|
) -> Union[bytes, Generator[bytes, None, None]]:
|
696
672
|
"""Send a request to the server to generate audio using Server-Sent Events.
|
697
673
|
|
@@ -704,6 +680,8 @@ class _SSE:
|
|
704
680
|
duration: The duration of the audio in seconds.
|
705
681
|
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
|
706
682
|
stream: Whether to stream the audio or not.
|
683
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
684
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
707
685
|
|
708
686
|
Returns:
|
709
687
|
If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
|
@@ -711,8 +689,11 @@ class _SSE:
|
|
711
689
|
Both the generator and the dictionary contain the following key(s):
|
712
690
|
- audio: The audio as bytes.
|
713
691
|
"""
|
714
|
-
voice =
|
715
|
-
|
692
|
+
voice = TTS._validate_and_construct_voice(
|
693
|
+
voice_id,
|
694
|
+
voice_embedding=voice_embedding,
|
695
|
+
experimental_voice_controls=_experimental_voice_controls,
|
696
|
+
)
|
716
697
|
request_body = {
|
717
698
|
"model_id": model_id,
|
718
699
|
"transcript": transcript,
|
@@ -826,6 +807,7 @@ class TTS(Resource):
|
|
826
807
|
sample_rate=output_format_obj["sample_rate"],
|
827
808
|
)
|
828
809
|
|
810
|
+
@staticmethod
|
829
811
|
def get_sample_rate(self, output_format_name: str) -> int:
|
830
812
|
"""Convenience method to get the sample rate for a given output format.
|
831
813
|
|
@@ -849,6 +831,40 @@ class TTS(Resource):
|
|
849
831
|
|
850
832
|
return output_format_obj["sample_rate"]
|
851
833
|
|
834
|
+
@staticmethod
|
835
|
+
def _validate_and_construct_voice(
|
836
|
+
voice_id: Optional[str] = None,
|
837
|
+
voice_embedding: Optional[List[float]] = None,
|
838
|
+
experimental_voice_controls: Optional[VoiceControls] = None,
|
839
|
+
) -> dict:
|
840
|
+
"""Validate and construct the voice dictionary for the request.
|
841
|
+
|
842
|
+
Args:
|
843
|
+
voice_id: The ID of the voice to use for generating audio.
|
844
|
+
voice_embedding: The embedding of the voice to use for generating audio.
|
845
|
+
experimental_voice_controls: Voice controls for emotion and speed.
|
846
|
+
Note: This is an experimental feature and may rapidly change in the future.
|
847
|
+
|
848
|
+
Returns:
|
849
|
+
A dictionary representing the voice configuration.
|
850
|
+
|
851
|
+
Raises:
|
852
|
+
ValueError: If neither or both voice_id and voice_embedding are specified.
|
853
|
+
"""
|
854
|
+
if voice_id is None and voice_embedding is None:
|
855
|
+
raise ValueError("Either voice_id or voice_embedding must be specified.")
|
856
|
+
|
857
|
+
if voice_id is not None and voice_embedding is not None:
|
858
|
+
raise ValueError("Only one of voice_id or voice_embedding should be specified.")
|
859
|
+
|
860
|
+
if voice_id:
|
861
|
+
voice = {"mode": "id", "id": voice_id}
|
862
|
+
else:
|
863
|
+
voice = {"mode": "embedding", "embedding": voice_embedding}
|
864
|
+
if experimental_voice_controls is not None:
|
865
|
+
voice["__experimental_controls"] = experimental_voice_controls
|
866
|
+
return voice
|
867
|
+
|
852
868
|
|
853
869
|
class AsyncCartesia(Cartesia):
|
854
870
|
"""The asynchronous version of the Cartesia client."""
|
@@ -946,8 +962,13 @@ class _AsyncSSE(_SSE):
|
|
946
962
|
duration: Optional[int] = None,
|
947
963
|
language: Optional[str] = None,
|
948
964
|
stream: bool = True,
|
965
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
949
966
|
) -> Union[bytes, AsyncGenerator[bytes, None]]:
|
950
|
-
voice =
|
967
|
+
voice = TTS._validate_and_construct_voice(
|
968
|
+
voice_id,
|
969
|
+
voice_embedding=voice_embedding,
|
970
|
+
experimental_voice_controls=_experimental_voice_controls,
|
971
|
+
)
|
951
972
|
|
952
973
|
request_body = {
|
953
974
|
"model_id": model_id,
|
@@ -1043,6 +1064,8 @@ class _AsyncTTSContext:
|
|
1043
1064
|
continue_: bool = False,
|
1044
1065
|
duration: Optional[int] = None,
|
1045
1066
|
language: Optional[str] = None,
|
1067
|
+
add_timestamps: bool = False,
|
1068
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
1046
1069
|
) -> None:
|
1047
1070
|
"""Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
|
1048
1071
|
|
@@ -1055,7 +1078,10 @@ class _AsyncTTSContext:
|
|
1055
1078
|
context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
|
1056
1079
|
continue_: Whether to continue the audio generation from the previous transcript or not.
|
1057
1080
|
duration: The duration of the audio in seconds.
|
1058
|
-
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual
|
1081
|
+
language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
|
1082
|
+
add_timestamps: Whether to return word-level timestamps.
|
1083
|
+
_experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
|
1084
|
+
Note: This is an experimental feature and may change rapidly in future releases.
|
1059
1085
|
|
1060
1086
|
Returns:
|
1061
1087
|
None.
|
@@ -1067,7 +1093,9 @@ class _AsyncTTSContext:
|
|
1067
1093
|
|
1068
1094
|
await self._websocket.connect()
|
1069
1095
|
|
1070
|
-
voice =
|
1096
|
+
voice = TTS._validate_and_construct_voice(
|
1097
|
+
voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
|
1098
|
+
)
|
1071
1099
|
|
1072
1100
|
request_body = {
|
1073
1101
|
"model_id": model_id,
|
@@ -1081,6 +1109,7 @@ class _AsyncTTSContext:
|
|
1081
1109
|
"context_id": self._context_id,
|
1082
1110
|
"continue": continue_,
|
1083
1111
|
"language": language,
|
1112
|
+
"add_timestamps": add_timestamps,
|
1084
1113
|
}
|
1085
1114
|
|
1086
1115
|
if duration is not None:
|
@@ -1234,7 +1263,10 @@ class _AsyncWebSocket(_WebSocket):
|
|
1234
1263
|
duration: Optional[int] = None,
|
1235
1264
|
language: Optional[str] = None,
|
1236
1265
|
stream: bool = True,
|
1266
|
+
add_timestamps: bool = False,
|
1267
|
+
_experimental_voice_controls: Optional[VoiceControls] = None,
|
1237
1268
|
) -> Union[bytes, AsyncGenerator[bytes, None]]:
|
1269
|
+
"""See :meth:`_WebSocket.send` for details."""
|
1238
1270
|
if context_id is None:
|
1239
1271
|
context_id = str(uuid.uuid4())
|
1240
1272
|
|
@@ -1250,6 +1282,8 @@ class _AsyncWebSocket(_WebSocket):
|
|
1250
1282
|
duration=duration,
|
1251
1283
|
language=language,
|
1252
1284
|
continue_=False,
|
1285
|
+
add_timestamps=add_timestamps,
|
1286
|
+
_experimental_voice_controls=_experimental_voice_controls,
|
1253
1287
|
)
|
1254
1288
|
|
1255
1289
|
generator = ctx.receive()
|
@@ -1258,10 +1292,17 @@ class _AsyncWebSocket(_WebSocket):
|
|
1258
1292
|
return generator
|
1259
1293
|
|
1260
1294
|
chunks = []
|
1295
|
+
word_timestamps = defaultdict(list)
|
1261
1296
|
async for chunk in generator:
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1297
|
+
if "audio" in chunk:
|
1298
|
+
chunks.append(chunk["audio"])
|
1299
|
+
if add_timestamps and "word_timestamps" in chunk:
|
1300
|
+
for k, v in chunk["word_timestamps"].items():
|
1301
|
+
word_timestamps[k].extend(v)
|
1302
|
+
out = {"audio": b"".join(chunks), "context_id": context_id}
|
1303
|
+
if add_timestamps:
|
1304
|
+
out["word_timestamps"] = word_timestamps
|
1305
|
+
return out
|
1265
1306
|
|
1266
1307
|
async def _process_responses(self):
|
1267
1308
|
try:
|
cartesia/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.0.5"
|
1
|
+
__version__ = "1.0.7"
|
{cartesia-1.0.5.dist-info → cartesia-1.0.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cartesia
|
3
|
-
Version: 1.0.5
|
3
|
+
Version: 1.0.7
|
4
4
|
Summary: The official Python library for the Cartesia API.
|
5
5
|
Home-page:
|
6
6
|
Author: Cartesia, Inc.
|
@@ -419,6 +419,34 @@ p.terminate()
|
|
419
419
|
ws.close() # Close the websocket connection
|
420
420
|
```
|
421
421
|
|
422
|
+
### Generating timestamps using WebSocket
|
423
|
+
|
424
|
+
The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
|
425
|
+
- words (list): The individual words in the transcript.
|
426
|
+
- start (list): The starting timestamp for each word (in seconds).
|
427
|
+
- end (list): The ending timestamp for each word (in seconds).
|
428
|
+
|
429
|
+
```python
|
430
|
+
response = ws.send(
|
431
|
+
model_id=model_id,
|
432
|
+
transcript=transcript,
|
433
|
+
voice_id=voice_id,
|
434
|
+
output_format=output_format,
|
435
|
+
stream=False,
|
436
|
+
add_timestamps=True
|
437
|
+
)
|
438
|
+
|
439
|
+
# Accessing the word_timestamps object
|
440
|
+
word_timestamps = response['word_timestamps']
|
441
|
+
|
442
|
+
words = word_timestamps['words']
|
443
|
+
start_times = word_timestamps['start']
|
444
|
+
end_times = word_timestamps['end']
|
445
|
+
|
446
|
+
for word, start, end in zip(words, start_times, end_times):
|
447
|
+
print(f"Word: {word}, Start: {start}, End: {end}")
|
448
|
+
```
|
449
|
+
|
422
450
|
### Multilingual Text-to-Speech [Alpha]
|
423
451
|
|
424
452
|
You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
|
@@ -472,6 +500,31 @@ stream.close()
|
|
472
500
|
p.terminate()
|
473
501
|
```
|
474
502
|
|
503
|
+
### Speed and Emotion Control [Experimental]
|
504
|
+
|
505
|
+
You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass a `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
|
506
|
+
|
507
|
+
Speed Options:
|
508
|
+
- `slowest`, `slow`, `normal`, `fast`, `fastest`
|
509
|
+
|
510
|
+
Emotion Options:
|
511
|
+
Use a list of tags in the format `emotion_name:level` where:
|
512
|
+
- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
|
513
|
+
- Levels: `lowest`, `low`, (omit for medium level), `high`, `highest`
|
514
|
+
The emotion tag levels add the specified emotion to the voice at the indicated intensity, with the omission of a level tag resulting in a medium intensity.
|
515
|
+
|
516
|
+
```python
|
517
|
+
ws.send(
|
518
|
+
model_id=model_id,
|
519
|
+
transcript=transcript,
|
520
|
+
voice_id=voice_id,
|
521
|
+
output_format=output_format,
|
522
|
+
_experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
|
523
|
+
)
|
524
|
+
```
|
525
|
+
|
526
|
+
### Jupyter Notebook Usage
|
527
|
+
|
475
528
|
If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
|
476
529
|
Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
|
477
530
|
|
@@ -0,0 +1,12 @@
|
|
1
|
+
cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
|
2
|
+
cartesia/_types.py,sha256=Lcp4GOot5UfI0EveDi2QdNALMo1rK4PwUrtMvW5P6vY,4406
|
3
|
+
cartesia/client.py,sha256=1T_HboqHZO6wjUDYpuWI7igV-QF_cRL4DY7v4NDzApo,51871
|
4
|
+
cartesia/version.py,sha256=BW7SWRpHoxuOQZ67pS20yog2LWYl-nK7-BEFBNrHGgA,22
|
5
|
+
cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
|
7
|
+
cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
|
8
|
+
cartesia-1.0.7.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
|
9
|
+
cartesia-1.0.7.dist-info/METADATA,sha256=vvU7-K0raiw4hmotlST5wi6uSnGiXjMpHxd2CIzvbMc,20336
|
10
|
+
cartesia-1.0.7.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
|
11
|
+
cartesia-1.0.7.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
12
|
+
cartesia-1.0.7.dist-info/RECORD,,
|
cartesia-1.0.5.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
|
2
|
-
cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
|
3
|
-
cartesia/client.py,sha256=46XiKTXa0gBXJ_GftMtLHAzBoX0GmWz_aWYuG68jaNQ,49316
|
4
|
-
cartesia/version.py,sha256=B9kKWJLln1i8LjtkcYecvNWGLTrez4gCUOHtnPlInFo,22
|
5
|
-
cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
|
7
|
-
cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
|
8
|
-
cartesia-1.0.5.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
|
9
|
-
cartesia-1.0.5.dist-info/METADATA,sha256=PImHYCNoo7iSnm3Br6PuRdqvli92c7AyXR4iagdv-d8,18368
|
10
|
-
cartesia-1.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
|
11
|
-
cartesia-1.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
|
12
|
-
cartesia-1.0.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|