cartesia 1.0.5__py2.py3-none-any.whl → 1.0.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/_types.py CHANGED
@@ -70,7 +70,31 @@ class VoiceMetadata(TypedDict):
70
70
  language: str
71
71
 
72
72
 
73
class VoiceControls(TypedDict, total=False):
    """Experimental voice control parameters for voice synthesis.

    Every key is optional; supply only the controls you want to set.
    For a complete list of supported parameters, refer to the Cartesia API
    documentation: https://docs.cartesia.ai/getting-started/welcome

    Keys:
        speed: Speed control, e.g. ``"fastest"`` or ``"slow"``.
        emotion: Comma-separated ``name:level`` pairs, e.g.
            ``"anger:high, positivity:low"``.

    Examples:
        >>> {"speed": "fastest"}
        {'speed': 'fastest'}
        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
        {'speed': 'slow', 'emotion': 'anger:high, positivity:low'}
        >>> {"emotion": "surprise:high, positivity:high"}
        {'emotion': 'surprise:high, positivity:high'}

    Note:
        This is an experimental class and is subject to rapid change in
        future versions.
    """

    # TypedDict items are declarations only: PEP 589 does not allow default
    # values, so "may be omitted" is expressed with ``total=False`` rather
    # than with ``= ""`` assignments.
    speed: str
    emotion: str
89
+
90
+
73
91
  class OutputFormat(TypedDict):
74
92
  container: str
75
93
  encoding: str
76
94
  sample_rate: int
95
+
96
+
97
class EventType:
    """String constants naming the kinds of event a response can carry.

    The values are plain ``str`` objects so they can be compared directly
    against the ``"type"`` field of a decoded response message.
    """

    # NULL: empty tag (presumably "no/unknown event type" -- confirm with callers).
    # AUDIO: response carries an audio chunk.
    # TIMESTAMPS: response carries word-level timestamps.
    NULL, AUDIO, TIMESTAMPS = "", "chunk", "timestamps"
cartesia/client.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import asyncio
2
2
  import base64
3
+ from collections import defaultdict
3
4
  import json
4
5
  import os
5
6
  import uuid
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator
27
28
 
28
29
  from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
29
30
  from cartesia._types import (
31
+ EventType,
30
32
  OutputFormat,
31
33
  OutputFormatMapping,
32
34
  DeprecatedOutputFormatMapping,
35
+ VoiceControls,
33
36
  VoiceMetadata,
34
37
  )
35
38
 
@@ -295,6 +298,7 @@ class _TTSContext:
295
298
  context_id: Optional[str] = None,
296
299
  duration: Optional[int] = None,
297
300
  language: Optional[str] = None,
301
+ _experimental_voice_controls: Optional[VoiceControls] = None,
298
302
  ) -> Generator[bytes, None, None]:
299
303
  """Send audio generation requests to the WebSocket and yield responses.
300
304
 
@@ -307,6 +311,8 @@ class _TTSContext:
307
311
  context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
308
312
  duration: The duration of the audio in seconds.
309
313
  language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
314
+ _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
315
+ Note: This is an experimental feature and may change rapidly in future releases.
310
316
 
311
317
  Yields:
312
318
  Dictionary containing the following key(s):
@@ -322,7 +328,7 @@ class _TTSContext:
322
328
 
323
329
  self._websocket.connect()
324
330
 
325
- voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
331
+ voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
326
332
 
327
333
  # Create the initial request body
328
334
  request_body = {
@@ -482,42 +488,16 @@ class _WebSocket:
482
488
  def _convert_response(
483
489
  self, response: Dict[str, any], include_context_id: bool
484
490
  ) -> Dict[str, Any]:
485
- audio = base64.b64decode(response["data"])
486
-
487
- optional_kwargs = {}
491
+ out = {}
492
+ if response["type"] == EventType.AUDIO:
493
+ out["audio"] = base64.b64decode(response["data"])
494
+ elif response["type"] == EventType.TIMESTAMPS:
495
+ out["word_timestamps"] = response["word_timestamps"]
496
+
488
497
  if include_context_id:
489
- optional_kwargs["context_id"] = response["context_id"]
490
-
491
- return {
492
- "audio": audio,
493
- **optional_kwargs,
494
- }
495
-
496
- def _validate_and_construct_voice(
497
- self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
498
- ) -> dict:
499
- """Validate and construct the voice dictionary for the request.
500
-
501
- Args:
502
- voice_id: The ID of the voice to use for generating audio.
503
- voice_embedding: The embedding of the voice to use for generating audio.
504
-
505
- Returns:
506
- A dictionary representing the voice configuration.
498
+ out["context_id"] = response["context_id"]
507
499
 
508
- Raises:
509
- ValueError: If neither or both voice_id and voice_embedding are specified.
510
- """
511
- if voice_id is None and voice_embedding is None:
512
- raise ValueError("Either voice_id or voice_embedding must be specified.")
513
-
514
- if voice_id is not None and voice_embedding is not None:
515
- raise ValueError("Only one of voice_id or voice_embedding should be specified.")
516
-
517
- if voice_id:
518
- return {"mode": "id", "id": voice_id}
519
-
520
- return {"mode": "embedding", "embedding": voice_embedding}
500
+ return out
521
501
 
522
502
  def send(
523
503
  self,
@@ -530,6 +510,8 @@ class _WebSocket:
530
510
  duration: Optional[int] = None,
531
511
  language: Optional[str] = None,
532
512
  stream: bool = True,
513
+ add_timestamps: bool = False,
514
+ _experimental_voice_controls: Optional[VoiceControls] = None,
533
515
  ) -> Union[bytes, Generator[bytes, None, None]]:
534
516
  """Send a request to the WebSocket to generate audio.
535
517
 
@@ -543,6 +525,9 @@ class _WebSocket:
543
525
  duration: The duration of the audio in seconds.
544
526
  language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
545
527
  stream: Whether to stream the audio or not.
528
+ add_timestamps: Whether to return word-level timestamps.
529
+ _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
530
+ Note: This is an experimental feature and may change rapidly in future releases.
546
531
 
547
532
  Returns:
548
533
  If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -556,7 +541,7 @@ class _WebSocket:
556
541
  if context_id is None:
557
542
  context_id = str(uuid.uuid4())
558
543
 
559
- voice = self._validate_and_construct_voice(voice_id, voice_embedding)
544
+ voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
560
545
 
561
546
  request_body = {
562
547
  "model_id": model_id,
@@ -569,6 +554,7 @@ class _WebSocket:
569
554
  },
570
555
  "context_id": context_id,
571
556
  "language": language,
557
+ "add_timestamps": add_timestamps,
572
558
  }
573
559
 
574
560
  if duration is not None:
@@ -580,10 +566,17 @@ class _WebSocket:
580
566
  return generator
581
567
 
582
568
  chunks = []
569
+ word_timestamps = defaultdict(list)
583
570
  for chunk in generator:
584
- chunks.append(chunk["audio"])
585
-
586
- return {"audio": b"".join(chunks), "context_id": context_id}
571
+ if "audio" in chunk:
572
+ chunks.append(chunk["audio"])
573
+ if add_timestamps and "word_timestamps" in chunk:
574
+ for k, v in chunk["word_timestamps"].items():
575
+ word_timestamps[k].extend(v)
576
+ out = {"audio": b"".join(chunks), "context_id": context_id}
577
+ if add_timestamps:
578
+ out["word_timestamps"] = word_timestamps
579
+ return out
587
580
 
588
581
  def _websocket_generator(self, request_body: Dict[str, Any]):
589
582
  self.websocket.send(json.dumps(request_body))
@@ -656,32 +649,6 @@ class _SSE:
656
649
  break
657
650
  return buffer, outputs
658
651
 
659
- def _validate_and_construct_voice(
660
- self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
661
- ) -> dict:
662
- """Validate and construct the voice dictionary for the request.
663
-
664
- Args:
665
- voice_id: The ID of the voice to use for generating audio.
666
- voice_embedding: The embedding of the voice to use for generating audio.
667
-
668
- Returns:
669
- A dictionary representing the voice configuration.
670
-
671
- Raises:
672
- ValueError: If neither or both voice_id and voice_embedding are specified.
673
- """
674
- if voice_id is None and voice_embedding is None:
675
- raise ValueError("Either voice_id or voice_embedding must be specified.")
676
-
677
- if voice_id is not None and voice_embedding is not None:
678
- raise ValueError("Only one of voice_id or voice_embedding should be specified.")
679
-
680
- if voice_id:
681
- return {"mode": "id", "id": voice_id}
682
-
683
- return {"mode": "embedding", "embedding": voice_embedding}
684
-
685
652
  def send(
686
653
  self,
687
654
  model_id: str,
@@ -692,6 +659,7 @@ class _SSE:
692
659
  duration: Optional[int] = None,
693
660
  language: Optional[str] = None,
694
661
  stream: bool = True,
662
+ _experimental_voice_controls: Optional[VoiceControls] = None,
695
663
  ) -> Union[bytes, Generator[bytes, None, None]]:
696
664
  """Send a request to the server to generate audio using Server-Sent Events.
697
665
 
@@ -704,6 +672,8 @@ class _SSE:
704
672
  duration: The duration of the audio in seconds.
705
673
  language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
706
674
  stream: Whether to stream the audio or not.
675
+ _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
676
+ Note: This is an experimental feature and may change rapidly in future releases.
707
677
 
708
678
  Returns:
709
679
  If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -711,8 +681,7 @@ class _SSE:
711
681
  Both the generator and the dictionary contain the following key(s):
712
682
  - audio: The audio as bytes.
713
683
  """
714
- voice = self._validate_and_construct_voice(voice_id, voice_embedding)
715
-
684
+ voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
716
685
  request_body = {
717
686
  "model_id": model_id,
718
687
  "transcript": transcript,
@@ -946,8 +915,9 @@ class _AsyncSSE(_SSE):
946
915
  duration: Optional[int] = None,
947
916
  language: Optional[str] = None,
948
917
  stream: bool = True,
918
+ _experimental_voice_controls: Optional[VoiceControls] = None,
949
919
  ) -> Union[bytes, AsyncGenerator[bytes, None]]:
950
- voice = self._validate_and_construct_voice(voice_id, voice_embedding)
920
+ voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)
951
921
 
952
922
  request_body = {
953
923
  "model_id": model_id,
@@ -1043,6 +1013,8 @@ class _AsyncTTSContext:
1043
1013
  continue_: bool = False,
1044
1014
  duration: Optional[int] = None,
1045
1015
  language: Optional[str] = None,
1016
+ add_timestamps: bool = False,
1017
+ _experimental_voice_controls: Optional[VoiceControls] = None,
1046
1018
  ) -> None:
1047
1019
  """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.
1048
1020
 
@@ -1055,7 +1027,10 @@ class _AsyncTTSContext:
1055
1027
  context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
1056
1028
  continue_: Whether to continue the audio generation from the previous transcript or not.
1057
1029
  duration: The duration of the audio in seconds.
1058
- language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
1030
+ language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
1031
+ add_timestamps: Whether to return word-level timestamps.
1032
+ _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
1033
+ Note: This is an experimental feature and may change rapidly in future releases.
1059
1034
 
1060
1035
  Returns:
1061
1036
  None.
@@ -1067,7 +1042,7 @@ class _AsyncTTSContext:
1067
1042
 
1068
1043
  await self._websocket.connect()
1069
1044
 
1070
- voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
1045
+ voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
1071
1046
 
1072
1047
  request_body = {
1073
1048
  "model_id": model_id,
@@ -1081,6 +1056,7 @@ class _AsyncTTSContext:
1081
1056
  "context_id": self._context_id,
1082
1057
  "continue": continue_,
1083
1058
  "language": language,
1059
+ "add_timestamps": add_timestamps,
1084
1060
  }
1085
1061
 
1086
1062
  if duration is not None:
@@ -1234,7 +1210,10 @@ class _AsyncWebSocket(_WebSocket):
1234
1210
  duration: Optional[int] = None,
1235
1211
  language: Optional[str] = None,
1236
1212
  stream: bool = True,
1213
+ add_timestamps: bool = False,
1214
+ _experimental_voice_controls: Optional[VoiceControls] = None,
1237
1215
  ) -> Union[bytes, AsyncGenerator[bytes, None]]:
1216
+ """See :meth:`_WebSocket.send` for details."""
1238
1217
  if context_id is None:
1239
1218
  context_id = str(uuid.uuid4())
1240
1219
 
@@ -1250,6 +1229,8 @@ class _AsyncWebSocket(_WebSocket):
1250
1229
  duration=duration,
1251
1230
  language=language,
1252
1231
  continue_=False,
1232
+ add_timestamps = add_timestamps,
1233
+ _experimental_voice_controls=_experimental_voice_controls,
1253
1234
  )
1254
1235
 
1255
1236
  generator = ctx.receive()
@@ -1258,10 +1239,17 @@ class _AsyncWebSocket(_WebSocket):
1258
1239
  return generator
1259
1240
 
1260
1241
  chunks = []
1242
+ word_timestamps = defaultdict(list)
1261
1243
  async for chunk in generator:
1262
- chunks.append(chunk["audio"])
1263
-
1264
- return {"audio": b"".join(chunks), "context_id": context_id}
1244
+ if "audio" in chunk:
1245
+ chunks.append(chunk["audio"])
1246
+ if add_timestamps and "word_timestamps" in chunk:
1247
+ for k, v in chunk["word_timestamps"].items():
1248
+ word_timestamps[k].extend(v)
1249
+ out = {"audio": b"".join(chunks), "context_id": context_id}
1250
+ if add_timestamps:
1251
+ out["word_timestamps"] = word_timestamps
1252
+ return out
1265
1253
 
1266
1254
  async def _process_responses(self):
1267
1255
  try:
@@ -1311,3 +1299,35 @@ class AsyncTTS(TTS):
1311
1299
  )
1312
1300
  await ws.connect()
1313
1301
  return ws
1302
+
1303
+
1304
+ def _validate_and_construct_voice(
1305
+ voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
1306
+ ) -> dict:
1307
+ """Validate and construct the voice dictionary for the request.
1308
+
1309
+ Args:
1310
+ voice_id: The ID of the voice to use for generating audio.
1311
+ voice_embedding: The embedding of the voice to use for generating audio.
1312
+ experimental_voice_controls: Voice controls for emotion and speed.
1313
+ Note: This is an experimental feature and may rapidly change in the future.
1314
+
1315
+ Returns:
1316
+ A dictionary representing the voice configuration.
1317
+
1318
+ Raises:
1319
+ ValueError: If neither or both voice_id and voice_embedding are specified.
1320
+ """
1321
+ if voice_id is None and voice_embedding is None:
1322
+ raise ValueError("Either voice_id or voice_embedding must be specified.")
1323
+
1324
+ if voice_id is not None and voice_embedding is not None:
1325
+ raise ValueError("Only one of voice_id or voice_embedding should be specified.")
1326
+
1327
+ if voice_id:
1328
+ voice = {"mode": "id", "id": voice_id}
1329
+ else:
1330
+ voice = {"mode": "embedding", "embedding": voice_embedding}
1331
+ if experimental_voice_controls is not None:
1332
+ voice["__experimental_controls"] = experimental_voice_controls
1333
+ return voice
cartesia/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.0.5"
1
+ __version__ = "1.0.6"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cartesia
3
- Version: 1.0.5
3
+ Version: 1.0.6
4
4
  Summary: The official Python library for the Cartesia API.
5
5
  Home-page:
6
6
  Author: Cartesia, Inc.
@@ -0,0 +1,12 @@
1
+ cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
2
+ cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
3
+ cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
4
+ cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
5
+ cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
7
+ cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
8
+ cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
9
+ cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
10
+ cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
11
+ cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
12
+ cartesia-1.0.6.dist-info/RECORD,,
@@ -1,12 +0,0 @@
1
- cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
2
- cartesia/_types.py,sha256=tO3Nef_V78TDMKDuIv_wsQLkxoSvYG4bdzFkMGXUFho,3765
3
- cartesia/client.py,sha256=46XiKTXa0gBXJ_GftMtLHAzBoX0GmWz_aWYuG68jaNQ,49316
4
- cartesia/version.py,sha256=B9kKWJLln1i8LjtkcYecvNWGLTrez4gCUOHtnPlInFo,22
5
- cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
7
- cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
8
- cartesia-1.0.5.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
9
- cartesia-1.0.5.dist-info/METADATA,sha256=PImHYCNoo7iSnm3Br6PuRdqvli92c7AyXR4iagdv-d8,18368
10
- cartesia-1.0.5.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
11
- cartesia-1.0.5.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
12
- cartesia-1.0.5.dist-info/RECORD,,