cartesia-1.0.6-py2.py3-none-any.whl → cartesia-1.0.7-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cartesia/_types.py CHANGED
@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
         "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
         "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
     }
-
+
     @classmethod
     @deprecated(
         vdeprecated="1.0.1",
@@ -74,18 +74,19 @@ class VoiceControls(TypedDict):
     """Defines different voice control parameters for voice synthesis.
 
     For a complete list of supported parameters, refer to the Cartesia API documentation.
-    https://docs.cartesia.ai/getting-started/welcome
+    https://docs.cartesia.ai/api-reference
 
     Examples:
         >>> {"speed": "fastest"}
-        >>> {"speed": "slow", "emotion": "anger:high, positivity:low"}
-        >>> {"emotion": "surprise:high, positivity:high"}
+        >>> {"speed": "slow", "emotion": ["sadness:high"]}
+        >>> {"emotion": ["surprise:highest", "curiosity"]}
 
     Note:
         This is an experimental class and is subject to rapid change in future versions.
     """
+
     speed: str = ""
-    emotion: str = ""
+    emotion: List[str] = []
 
 
 class OutputFormat(TypedDict):
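
To make the `emotion` change above concrete: 1.0.6 accepted a comma-separated string, while 1.0.7 expects a list of `emotion_name:level` tags. A minimal sketch under the new schema (the import path comes from this diff; the values are illustrative only):

```python
from cartesia._types import VoiceControls

# 1.0.7 style: `emotion` is a list of "emotion_name:level" tags.
# Omitting ":level" (as in "curiosity") implies medium intensity.
controls: VoiceControls = {
    "speed": "slow",
    "emotion": ["sadness:high", "curiosity"],
}
```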
cartesia/client.py CHANGED
@@ -328,7 +328,11 @@ class _TTSContext:
 
         self._websocket.connect()
 
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         # Create the initial request body
         request_body = {
@@ -493,7 +497,7 @@ class _WebSocket:
             out["audio"] = base64.b64decode(response["data"])
         elif response["type"] == EventType.TIMESTAMPS:
             out["word_timestamps"] = response["word_timestamps"]
-
+
         if include_context_id:
             out["context_id"] = response["context_id"]
 
@@ -541,7 +545,11 @@ class _WebSocket:
         if context_id is None:
             context_id = str(uuid.uuid4())
 
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls = _experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         request_body = {
             "model_id": model_id,
@@ -681,7 +689,11 @@ class _SSE:
         Both the generator and the dictionary contain the following key(s):
         - audio: The audio as bytes.
         """
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding, experimental_voice_controls=_experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
         request_body = {
             "model_id": model_id,
             "transcript": transcript,
@@ -795,6 +807,7 @@ class TTS(Resource):
             sample_rate=output_format_obj["sample_rate"],
         )
 
+    @staticmethod
     def get_sample_rate(self, output_format_name: str) -> int:
         """Convenience method to get the sample rate for a given output format.
 
@@ -818,6 +831,40 @@ class TTS(Resource):
 
         return output_format_obj["sample_rate"]
 
+    @staticmethod
+    def _validate_and_construct_voice(
+        voice_id: Optional[str] = None,
+        voice_embedding: Optional[List[float]] = None,
+        experimental_voice_controls: Optional[VoiceControls] = None,
+    ) -> dict:
+        """Validate and construct the voice dictionary for the request.
+
+        Args:
+            voice_id: The ID of the voice to use for generating audio.
+            voice_embedding: The embedding of the voice to use for generating audio.
+            experimental_voice_controls: Voice controls for emotion and speed.
+                Note: This is an experimental feature and may rapidly change in the future.
+
+        Returns:
+            A dictionary representing the voice configuration.
+
+        Raises:
+            ValueError: If neither or both voice_id and voice_embedding are specified.
+        """
+        if voice_id is None and voice_embedding is None:
+            raise ValueError("Either voice_id or voice_embedding must be specified.")
+
+        if voice_id is not None and voice_embedding is not None:
+            raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+
+        if voice_id:
+            voice = {"mode": "id", "id": voice_id}
+        else:
+            voice = {"mode": "embedding", "embedding": voice_embedding}
+        if experimental_voice_controls is not None:
+            voice["__experimental_controls"] = experimental_voice_controls
+        return voice
+
 
 class AsyncCartesia(Cartesia):
     """The asynchronous version of the Cartesia client."""
@@ -917,7 +964,11 @@ class _AsyncSSE(_SSE):
         stream: bool = True,
         _experimental_voice_controls: Optional[VoiceControls] = None,
     ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-        voice = _validate_and_construct_voice(voice_id, voice_embedding=voice_embedding,experimental_voice_controls=_experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id,
+            voice_embedding=voice_embedding,
+            experimental_voice_controls=_experimental_voice_controls,
+        )
 
         request_body = {
             "model_id": model_id,
@@ -1042,7 +1093,9 @@ class _AsyncTTSContext:
 
         await self._websocket.connect()
 
-        voice = _validate_and_construct_voice(voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls)
+        voice = TTS._validate_and_construct_voice(
+            voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
+        )
 
         request_body = {
             "model_id": model_id,
@@ -1229,7 +1282,7 @@ class _AsyncWebSocket(_WebSocket):
             duration=duration,
             language=language,
             continue_=False,
-            add_timestamps = add_timestamps,
+            add_timestamps=add_timestamps,
             _experimental_voice_controls=_experimental_voice_controls,
         )
 
@@ -1299,35 +1352,3 @@ class AsyncTTS(TTS):
         )
         await ws.connect()
         return ws
-
-
-def _validate_and_construct_voice(
-    voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None, experimental_voice_controls: Optional[VoiceControls] = None
-) -> dict:
-    """Validate and construct the voice dictionary for the request.
-
-    Args:
-        voice_id: The ID of the voice to use for generating audio.
-        voice_embedding: The embedding of the voice to use for generating audio.
-        experimental_voice_controls: Voice controls for emotion and speed.
-            Note: This is an experimental feature and may rapidly change in the future.
-
-    Returns:
-        A dictionary representing the voice configuration.
-
-    Raises:
-        ValueError: If neither or both voice_id and voice_embedding are specified.
-    """
-    if voice_id is None and voice_embedding is None:
-        raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-    if voice_id is not None and voice_embedding is not None:
-        raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-    if voice_id:
-        voice = {"mode": "id", "id": voice_id}
-    else:
-        voice = {"mode": "embedding", "embedding": voice_embedding}
-    if experimental_voice_controls is not None:
-        voice["__experimental_controls"] = experimental_voice_controls
-    return voice
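
The refactor above moves the module-level `_validate_and_construct_voice` helper onto `TTS` as a static method. A minimal sketch of its behavior, based solely on the implementation shown in this diff (note it is an internal, underscore-prefixed helper, not public API; the voice ID below is hypothetical):

```python
from cartesia.client import TTS

# Exactly one of voice_id / voice_embedding may be given.
voice = TTS._validate_and_construct_voice(voice_id="some-voice-id")
assert voice == {"mode": "id", "id": "some-voice-id"}

# Voice controls, when provided, ride along under "__experimental_controls".
voice = TTS._validate_and_construct_voice(
    voice_id="some-voice-id",
    experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
)
assert voice["__experimental_controls"] == {"speed": "fast", "emotion": ["positivity:high"]}

# Passing neither input (or both) raises ValueError.
```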
cartesia/version.py CHANGED
@@ -1 +1 @@
-__version__ = "1.0.6"
+__version__ = "1.0.7"
cartesia-1.0.6.dist-info/METADATA → cartesia-1.0.7.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cartesia
-Version: 1.0.6
+Version: 1.0.7
 Summary: The official Python library for the Cartesia API.
 Home-page:
 Author: Cartesia, Inc.
@@ -419,6 +419,34 @@ p.terminate()
 ws.close()  # Close the websocket connection
 ```
 
+### Generating timestamps using WebSocket
+
+The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+- words (list): The individual words in the transcript.
+- start (list): The starting timestamp for each word (in seconds).
+- end (list): The ending timestamp for each word (in seconds).
+
+```python
+response = ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    stream=False,
+    add_timestamps=True,
+)
+
+# Accessing the word_timestamps object
+word_timestamps = response['word_timestamps']
+
+words = word_timestamps['words']
+start_times = word_timestamps['start']
+end_times = word_timestamps['end']
+
+for word, start, end in zip(words, start_times, end_times):
+    print(f"Word: {word}, Start: {start}, End: {end}")
+```
+
 ### Multilingual Text-to-Speech [Alpha]
 
 You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -472,6 +500,31 @@ stream.close()
 p.terminate()
 ```
 
+### Speed and Emotion Control [Experimental]
+
+You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass an `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+Speed Options:
+- `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+Emotion Options:
+Use a list of tags in the format `emotion_name:level`, where:
+- Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+- Levels: `lowest`, `low`, (omit for medium), `high`, `highest`
+Each tag adds the specified emotion to the voice at the indicated intensity; omitting the level tag yields a medium intensity.
+
+```python
+ws.send(
+    model_id=model_id,
+    transcript=transcript,
+    voice_id=voice_id,
+    output_format=output_format,
+    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+)
+```
+
+### Jupyter Notebook Usage
+
 If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
 Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).
 
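Regarding the Jupyter playback note in the README section above, a minimal sketch, assuming `audio_bytes` holds raw little-endian float32 PCM (one of the client's `fp32` output formats) and `rate` matches the requested sample rate (both names are placeholders from your own generation call):

```python
import numpy as np
from IPython.display import Audio

# Interpret the raw float32 PCM bytes as samples, then render an
# inline audio player in the notebook.
samples = np.frombuffer(audio_bytes, dtype=np.float32)
Audio(samples, rate=rate)
```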
cartesia-1.0.7.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
+cartesia/_types.py,sha256=Lcp4GOot5UfI0EveDi2QdNALMo1rK4PwUrtMvW5P6vY,4406
+cartesia/client.py,sha256=1T_HboqHZO6wjUDYpuWI7igV-QF_cRL4DY7v4NDzApo,51871
+cartesia/version.py,sha256=BW7SWRpHoxuOQZ67pS20yog2LWYl-nK7-BEFBNrHGgA,22
+cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
+cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
+cartesia-1.0.7.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
+cartesia-1.0.7.dist-info/METADATA,sha256=vvU7-K0raiw4hmotlST5wi6uSnGiXjMpHxd2CIzvbMc,20336
+cartesia-1.0.7.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
+cartesia-1.0.7.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
+cartesia-1.0.7.dist-info/RECORD,,
cartesia-1.0.6.dist-info/RECORD REMOVED
@@ -1,12 +0,0 @@
-cartesia/__init__.py,sha256=jMIf2O7dTGxvTA5AfXtmh1H_EGfMtQseR5wXrjNRbLs,93
-cartesia/_types.py,sha256=l3tKFnyUInn5_OJOSB63Mp1g16p9R23VNAuJ5qykOzY,4424
-cartesia/client.py,sha256=zLyxaDkX0et6lY_hthSgDA-eoP6NXEN5ysDsxxseyZQ,51502
-cartesia/version.py,sha256=mqMuQB3aqJVPrHHqJMLjqiMKUiJjozc7EPLcX5DpKHg,22
-cartesia/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cartesia/utils/deprecated.py,sha256=2cXvGtrxhPeUZA5LWy2n_U5OFLDv7SHeFtzqhjSJGyk,1674
-cartesia/utils/retry.py,sha256=nuwWRfu3MOVTxIQMLjYf6WLaxSlnu_GdE3QjTV0zisQ,3339
-cartesia-1.0.6.dist-info/LICENSE.md,sha256=PT2YG5wEtEX1TNDn5sXkUXqbn-neyr7cZenTxd40ql4,1074
-cartesia-1.0.6.dist-info/METADATA,sha256=JcNWr0UHSp_GK3X05YD92zbLZonV0BkeyuzT90HuGSs,18368
-cartesia-1.0.6.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
-cartesia-1.0.6.dist-info/top_level.txt,sha256=rTX4HnnCegMxl1FK9czpVC7GAvf3SwDzPG65qP-BS4w,9
-cartesia-1.0.6.dist-info/RECORD,,