cartesia 1.0.5__tar.gz → 1.0.7__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 1.0.5
+ Version: 1.0.7
  Summary: The official Python library for the Cartesia API.
  Home-page:
  Author: Cartesia, Inc.
@@ -401,6 +401,34 @@ p.terminate()
  ws.close() # Close the websocket connection
  ```

+ ### Generating timestamps using WebSocket
+
+ The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+ - words (list): The individual words in the transcript.
+ - start (list): The starting timestamp for each word (in seconds).
+ - end (list): The ending timestamp for each word (in seconds).
+
+ ```python
+ response = ws.send(
+     model_id=model_id,
+     transcript=transcript,
+     voice_id=voice_id,
+     output_format=output_format,
+     stream=False,
+     add_timestamps=True
+ )
+
+ # Accessing the word_timestamps object
+ word_timestamps = response['word_timestamps']
+
+ words = word_timestamps['words']
+ start_times = word_timestamps['start']
+ end_times = word_timestamps['end']
+
+ for word, start, end in zip(words, start_times, end_times):
+     print(f"Word: {word}, Start: {start}, End: {end}")
+ ```
+
  ### Multilingual Text-to-Speech [Alpha]

  You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -454,6 +482,31 @@ stream.close()
  p.terminate()
  ```

+ ### Speed and Emotion Control [Experimental]
+
+ You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass an `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+ Speed Options:
+ - `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+ Emotion Options:
+ Use a list of tags in the format `emotion_name:level` where:
+ - Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+ - Levels: `lowest`, `low`, `high`, `highest` (omit the level for medium intensity)
+ Each tag adds the specified emotion to the voice at the indicated intensity.
+
+ ```python
+ ws.send(
+     model_id=model_id,
+     transcript=transcript,
+     voice_id=voice_id,
+     output_format=output_format,
+     _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+ )
+ ```
+
+ ### Jupyter Notebook Usage
+
  If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
  Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).

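The two README additions above pair naturally. A minimal end-to-end sketch (editorial illustration, not part of the packaged README), assuming the client is constructed with an API key as in the unchanged parts of the README; the model ID is illustrative and the voice ID is the sample one used in this package's test suite:

```python
import os

from cartesia import Cartesia

# Assumption: the client takes an API key, as in the existing README examples.
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))
ws = client.tts.websocket()

response = ws.send(
    model_id="sonic-english",  # illustrative model ID; the tests use DEFAULT_MODEL_ID
    transcript="Hello, world!",
    voice_id="d46abd1d-2d02-43e8-819f-51fb652c1c61",  # sample voice ID from the test suite
    output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
    stream=False,
    add_timestamps=True,  # new in 1.0.7: word-level timestamps
    _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},  # experimental
)

audio = response["audio"]  # raw audio bytes
timestamps = response["word_timestamps"]
for word, start, end in zip(timestamps["words"], timestamps["start"], timestamps["end"]):
    print(f"{word}: {start:.2f}s -> {end:.2f}s")

ws.close()
```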
@@ -384,6 +384,34 @@ p.terminate()
  ws.close() # Close the websocket connection
  ```

+ ### Generating timestamps using WebSocket
+
+ The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+ - words (list): The individual words in the transcript.
+ - start (list): The starting timestamp for each word (in seconds).
+ - end (list): The ending timestamp for each word (in seconds).
+
+ ```python
+ response = ws.send(
+     model_id=model_id,
+     transcript=transcript,
+     voice_id=voice_id,
+     output_format=output_format,
+     stream=False,
+     add_timestamps=True
+ )
+
+ # Accessing the word_timestamps object
+ word_timestamps = response['word_timestamps']
+
+ words = word_timestamps['words']
+ start_times = word_timestamps['start']
+ end_times = word_timestamps['end']
+
+ for word, start, end in zip(words, start_times, end_times):
+     print(f"Word: {word}, Start: {start}, End: {end}")
+ ```
+
  ### Multilingual Text-to-Speech [Alpha]

  You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -437,6 +465,31 @@ stream.close()
  p.terminate()
  ```

+ ### Speed and Emotion Control [Experimental]
+
+ You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass an `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+ Speed Options:
+ - `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+ Emotion Options:
+ Use a list of tags in the format `emotion_name:level` where:
+ - Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+ - Levels: `lowest`, `low`, `high`, `highest` (omit the level for medium intensity)
+ Each tag adds the specified emotion to the voice at the indicated intensity.
+
+ ```python
+ ws.send(
+     model_id=model_id,
+     transcript=transcript,
+     voice_id=voice_id,
+     output_format=output_format,
+     _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+ )
+ ```
+
+ ### Jupyter Notebook Usage
+
  If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
  Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).

@@ -45,7 +45,7 @@ class DeprecatedOutputFormatMapping:
          "mulaw_8000": {"container": "raw", "encoding": "pcm_mulaw", "sample_rate": 8000},
          "alaw_8000": {"container": "raw", "encoding": "pcm_alaw", "sample_rate": 8000},
      }
-
+
      @classmethod
      @deprecated(
          vdeprecated="1.0.1",
@@ -70,7 +70,32 @@ class VoiceMetadata(TypedDict):
      language: str


+ class VoiceControls(TypedDict):
+     """Defines different voice control parameters for voice synthesis.
+
+     For a complete list of supported parameters, refer to the Cartesia API documentation.
+     https://docs.cartesia.ai/api-reference
+
+     Examples:
+         >>> {"speed": "fastest"}
+         >>> {"speed": "slow", "emotion": ["sadness:high"]}
+         >>> {"emotion": ["surprise:highest", "curiosity"]}
+
+     Note:
+         This is an experimental class and is subject to rapid change in future versions.
+     """
+
+     speed: str = ""
+     emotion: List[str] = []
+
+
  class OutputFormat(TypedDict):
      container: str
      encoding: str
      sample_rate: int
+
+
+ class EventType:
+     NULL = ""
+     AUDIO = "chunk"
+     TIMESTAMPS = "timestamps"
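A small sketch of how downstream code might consume these new `cartesia._types` members (editorial illustration; it assumes raw WebSocket messages carry a `type` field, which is how `_convert_response` in the client changes below dispatches on them):

```python
from typing import Any, Dict

from cartesia._types import EventType, VoiceControls

# VoiceControls is a TypedDict, so a plain dict literal type-checks against it.
controls: VoiceControls = {"speed": "slow", "emotion": ["sadness:high"]}

def classify_message(message: Dict[str, Any]) -> str:
    """Mirror the dispatch performed by _WebSocket._convert_response in 1.0.7."""
    if message.get("type") == EventType.AUDIO:       # "chunk"
        return "audio"
    if message.get("type") == EventType.TIMESTAMPS:  # "timestamps"
        return "word_timestamps"
    return "other"
```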
@@ -1,5 +1,6 @@
  import asyncio
  import base64
+ from collections import defaultdict
  import json
  import os
  import uuid
@@ -27,9 +28,11 @@ from iterators import TimeoutIterator

  from cartesia.utils.retry import retry_on_connection_error, retry_on_connection_error_async
  from cartesia._types import (
+     EventType,
      OutputFormat,
      OutputFormatMapping,
      DeprecatedOutputFormatMapping,
+     VoiceControls,
      VoiceMetadata,
  )

@@ -295,6 +298,7 @@ class _TTSContext:
          context_id: Optional[str] = None,
          duration: Optional[int] = None,
          language: Optional[str] = None,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
      ) -> Generator[bytes, None, None]:
          """Send audio generation requests to the WebSocket and yield responses.

@@ -307,6 +311,8 @@ class _TTSContext:
              context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
              duration: The duration of the audio in seconds.
              language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+             _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                 Note: This is an experimental feature and may change rapidly in future releases.

          Yields:
              Dictionary containing the following key(s):
@@ -322,7 +328,11 @@ class _TTSContext:

          self._websocket.connect()

-         voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+         voice = TTS._validate_and_construct_voice(
+             voice_id,
+             voice_embedding=voice_embedding,
+             experimental_voice_controls=_experimental_voice_controls,
+         )

          # Create the initial request body
          request_body = {
@@ -482,42 +492,16 @@ class _WebSocket:
      def _convert_response(
          self, response: Dict[str, any], include_context_id: bool
      ) -> Dict[str, Any]:
-         audio = base64.b64decode(response["data"])
+         out = {}
+         if response["type"] == EventType.AUDIO:
+             out["audio"] = base64.b64decode(response["data"])
+         elif response["type"] == EventType.TIMESTAMPS:
+             out["word_timestamps"] = response["word_timestamps"]

-         optional_kwargs = {}
          if include_context_id:
-             optional_kwargs["context_id"] = response["context_id"]
-
-         return {
-             "audio": audio,
-             **optional_kwargs,
-         }
-
-     def _validate_and_construct_voice(
-         self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-     ) -> dict:
-         """Validate and construct the voice dictionary for the request.
-
-         Args:
-             voice_id: The ID of the voice to use for generating audio.
-             voice_embedding: The embedding of the voice to use for generating audio.
+             out["context_id"] = response["context_id"]

-         Returns:
-             A dictionary representing the voice configuration.
-
-         Raises:
-             ValueError: If neither or both voice_id and voice_embedding are specified.
-         """
-         if voice_id is None and voice_embedding is None:
-             raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-         if voice_id is not None and voice_embedding is not None:
-             raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-         if voice_id:
-             return {"mode": "id", "id": voice_id}
-
-         return {"mode": "embedding", "embedding": voice_embedding}
+         return out

      def send(
          self,
@@ -530,6 +514,8 @@ class _WebSocket:
          duration: Optional[int] = None,
          language: Optional[str] = None,
          stream: bool = True,
+         add_timestamps: bool = False,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
      ) -> Union[bytes, Generator[bytes, None, None]]:
          """Send a request to the WebSocket to generate audio.

@@ -543,6 +529,9 @@ class _WebSocket:
              duration: The duration of the audio in seconds.
              language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
              stream: Whether to stream the audio or not.
+             add_timestamps: Whether to return word-level timestamps.
+             _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                 Note: This is an experimental feature and may change rapidly in future releases.

          Returns:
              If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -556,7 +545,11 @@ class _WebSocket:
          if context_id is None:
              context_id = str(uuid.uuid4())

-         voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+         voice = TTS._validate_and_construct_voice(
+             voice_id,
+             voice_embedding=voice_embedding,
+             experimental_voice_controls=_experimental_voice_controls,
+         )

          request_body = {
              "model_id": model_id,
@@ -569,6 +562,7 @@ class _WebSocket:
              },
              "context_id": context_id,
              "language": language,
+             "add_timestamps": add_timestamps,
          }

          if duration is not None:
@@ -580,10 +574,17 @@ class _WebSocket:
              return generator

          chunks = []
+         word_timestamps = defaultdict(list)
          for chunk in generator:
-             chunks.append(chunk["audio"])
-
-         return {"audio": b"".join(chunks), "context_id": context_id}
+             if "audio" in chunk:
+                 chunks.append(chunk["audio"])
+             if add_timestamps and "word_timestamps" in chunk:
+                 for k, v in chunk["word_timestamps"].items():
+                     word_timestamps[k].extend(v)
+         out = {"audio": b"".join(chunks), "context_id": context_id}
+         if add_timestamps:
+             out["word_timestamps"] = word_timestamps
+         return out

      def _websocket_generator(self, request_body: Dict[str, Any]):
          self.websocket.send(json.dumps(request_body))
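A hedged usage sketch of the synchronous generator path above: with `stream=True` and `add_timestamps=True`, a yielded chunk may carry `audio`, `word_timestamps`, or both, which is why the merge loop above checks for each key. Client construction and the model ID are illustrative assumptions, not taken from this diff:

```python
import os

from cartesia import Cartesia

client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))  # assumed constructor
ws = client.tts.websocket()

chunks = ws.send(
    model_id="sonic-english",  # illustrative model ID
    transcript="Hello, world!",
    voice_id="d46abd1d-2d02-43e8-819f-51fb652c1c61",  # sample voice ID from the test suite
    output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
    stream=True,
    add_timestamps=True,
)

audio = b""
for chunk in chunks:
    if "audio" in chunk:
        audio += chunk["audio"]
    if "word_timestamps" in chunk:  # timestamp events interleave with audio chunks
        print(chunk["word_timestamps"]["words"])

ws.close()
```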
@@ -656,32 +657,6 @@ class _SSE:
              break
          return buffer, outputs

-     def _validate_and_construct_voice(
-         self, voice_id: Optional[str] = None, voice_embedding: Optional[List[float]] = None
-     ) -> dict:
-         """Validate and construct the voice dictionary for the request.
-
-         Args:
-             voice_id: The ID of the voice to use for generating audio.
-             voice_embedding: The embedding of the voice to use for generating audio.
-
-         Returns:
-             A dictionary representing the voice configuration.
-
-         Raises:
-             ValueError: If neither or both voice_id and voice_embedding are specified.
-         """
-         if voice_id is None and voice_embedding is None:
-             raise ValueError("Either voice_id or voice_embedding must be specified.")
-
-         if voice_id is not None and voice_embedding is not None:
-             raise ValueError("Only one of voice_id or voice_embedding should be specified.")
-
-         if voice_id:
-             return {"mode": "id", "id": voice_id}
-
-         return {"mode": "embedding", "embedding": voice_embedding}
-
      def send(
          self,
          model_id: str,
@@ -692,6 +667,7 @@ class _SSE:
          duration: Optional[int] = None,
          language: Optional[str] = None,
          stream: bool = True,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
      ) -> Union[bytes, Generator[bytes, None, None]]:
          """Send a request to the server to generate audio using Server-Sent Events.

@@ -704,6 +680,8 @@ class _SSE:
              duration: The duration of the audio in seconds.
              language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
              stream: Whether to stream the audio or not.
+             _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                 Note: This is an experimental feature and may change rapidly in future releases.

          Returns:
              If `stream` is True, the method returns a generator that yields chunks. Each chunk is a dictionary.
@@ -711,8 +689,11 @@ class _SSE:
          Both the generator and the dictionary contain the following key(s):
          - audio: The audio as bytes.
          """
-         voice = self._validate_and_construct_voice(voice_id, voice_embedding)
-
+         voice = TTS._validate_and_construct_voice(
+             voice_id,
+             voice_embedding=voice_embedding,
+             experimental_voice_controls=_experimental_voice_controls,
+         )
          request_body = {
              "model_id": model_id,
              "transcript": transcript,
@@ -826,6 +807,7 @@ class TTS(Resource):
              sample_rate=output_format_obj["sample_rate"],
          )

+     @staticmethod
      def get_sample_rate(self, output_format_name: str) -> int:
          """Convenience method to get the sample rate for a given output format.

@@ -849,6 +831,40 @@ class TTS(Resource):

          return output_format_obj["sample_rate"]

+     @staticmethod
+     def _validate_and_construct_voice(
+         voice_id: Optional[str] = None,
+         voice_embedding: Optional[List[float]] = None,
+         experimental_voice_controls: Optional[VoiceControls] = None,
+     ) -> dict:
+         """Validate and construct the voice dictionary for the request.
+
+         Args:
+             voice_id: The ID of the voice to use for generating audio.
+             voice_embedding: The embedding of the voice to use for generating audio.
+             experimental_voice_controls: Voice controls for emotion and speed.
+                 Note: This is an experimental feature and may rapidly change in the future.
+
+         Returns:
+             A dictionary representing the voice configuration.
+
+         Raises:
+             ValueError: If neither or both voice_id and voice_embedding are specified.
+         """
+         if voice_id is None and voice_embedding is None:
+             raise ValueError("Either voice_id or voice_embedding must be specified.")
+
+         if voice_id is not None and voice_embedding is not None:
+             raise ValueError("Only one of voice_id or voice_embedding should be specified.")
+
+         if voice_id:
+             voice = {"mode": "id", "id": voice_id}
+         else:
+             voice = {"mode": "embedding", "embedding": voice_embedding}
+         if experimental_voice_controls is not None:
+             voice["__experimental_controls"] = experimental_voice_controls
+         return voice
+

  class AsyncCartesia(Cartesia):
      """The asynchronous version of the Cartesia client."""
@@ -946,8 +962,13 @@ class _AsyncSSE(_SSE):
          duration: Optional[int] = None,
          language: Optional[str] = None,
          stream: bool = True,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
      ) -> Union[bytes, AsyncGenerator[bytes, None]]:
-         voice = self._validate_and_construct_voice(voice_id, voice_embedding)
+         voice = TTS._validate_and_construct_voice(
+             voice_id,
+             voice_embedding=voice_embedding,
+             experimental_voice_controls=_experimental_voice_controls,
+         )

          request_body = {
              "model_id": model_id,
@@ -1043,6 +1064,8 @@ class _AsyncTTSContext:
          continue_: bool = False,
          duration: Optional[int] = None,
          language: Optional[str] = None,
+         add_timestamps: bool = False,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
      ) -> None:
          """Send audio generation requests to the WebSocket. The response can be received using the `receive` method.

@@ -1055,7 +1078,10 @@ class _AsyncTTSContext:
              context_id: The context ID to use for the request. If not specified, a random context ID will be generated.
              continue_: Whether to continue the audio generation from the previous transcript or not.
              duration: The duration of the audio in seconds.
-             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`
+             language: The language code for the audio request. This can only be used with `model_id = sonic-multilingual`.
+             add_timestamps: Whether to return word-level timestamps.
+             _experimental_voice_controls: Experimental voice controls for controlling speed and emotion.
+                 Note: This is an experimental feature and may change rapidly in future releases.

          Returns:
              None.
@@ -1067,7 +1093,9 @@ class _AsyncTTSContext:

          await self._websocket.connect()

-         voice = self._websocket._validate_and_construct_voice(voice_id, voice_embedding)
+         voice = TTS._validate_and_construct_voice(
+             voice_id, voice_embedding, experimental_voice_controls=_experimental_voice_controls
+         )

          request_body = {
              "model_id": model_id,
@@ -1081,6 +1109,7 @@ class _AsyncTTSContext:
              "context_id": self._context_id,
              "continue": continue_,
              "language": language,
+             "add_timestamps": add_timestamps,
          }

          if duration is not None:
@@ -1234,7 +1263,10 @@ class _AsyncWebSocket(_WebSocket):
          duration: Optional[int] = None,
          language: Optional[str] = None,
          stream: bool = True,
+         add_timestamps: bool = False,
+         _experimental_voice_controls: Optional[VoiceControls] = None,
      ) -> Union[bytes, AsyncGenerator[bytes, None]]:
+         """See :meth:`_WebSocket.send` for details."""
          if context_id is None:
              context_id = str(uuid.uuid4())

@@ -1250,6 +1282,8 @@ class _AsyncWebSocket(_WebSocket):
              duration=duration,
              language=language,
              continue_=False,
+             add_timestamps=add_timestamps,
+             _experimental_voice_controls=_experimental_voice_controls,
          )

          generator = ctx.receive()
@@ -1258,10 +1292,17 @@ class _AsyncWebSocket(_WebSocket):
              return generator

          chunks = []
+         word_timestamps = defaultdict(list)
          async for chunk in generator:
-             chunks.append(chunk["audio"])
-
-         return {"audio": b"".join(chunks), "context_id": context_id}
+             if "audio" in chunk:
+                 chunks.append(chunk["audio"])
+             if add_timestamps and "word_timestamps" in chunk:
+                 for k, v in chunk["word_timestamps"].items():
+                     word_timestamps[k].extend(v)
+         out = {"audio": b"".join(chunks), "context_id": context_id}
+         if add_timestamps:
+             out["word_timestamps"] = word_timestamps
+         return out

      async def _process_responses(self):
          try:
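The async path mirrors the synchronous one. A brief sketch of consuming it (editorial illustration; client construction and the model ID are assumptions), matching the async test added further down:

```python
import asyncio
import os

from cartesia import AsyncCartesia

async def main() -> None:
    client = AsyncCartesia(api_key=os.environ.get("CARTESIA_API_KEY"))  # assumed constructor
    ws = await client.tts.websocket()
    output = await ws.send(
        model_id="sonic-english",  # illustrative model ID
        transcript="Hello, world!",
        voice_id="d46abd1d-2d02-43e8-819f-51fb652c1c61",  # sample voice ID from the test suite
        output_format={"container": "raw", "encoding": "pcm_f32le", "sample_rate": 44100},
        stream=True,
        add_timestamps=True,
    )
    audio = b""
    async for chunk in output:
        if "audio" in chunk:
            audio += chunk["audio"]
        if "word_timestamps" in chunk:  # timestamp events interleave with audio chunks
            print(chunk["word_timestamps"]["words"])
    await ws.close()
    await client.close()

asyncio.run(main())
```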
@@ -0,0 +1 @@
+ __version__ = "1.0.7"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cartesia
- Version: 1.0.5
+ Version: 1.0.7
  Summary: The official Python library for the Cartesia API.
  Home-page:
  Author: Cartesia, Inc.
@@ -401,6 +401,34 @@ p.terminate()
  ws.close() # Close the websocket connection
  ```

+ ### Generating timestamps using WebSocket
+
+ The WebSocket endpoint supports timestamps, allowing you to get detailed timing information for each word in the transcript. To enable this feature, pass an `add_timestamps` boolean flag to the `send` method. The results are returned in the `word_timestamps` object, which contains three keys:
+ - words (list): The individual words in the transcript.
+ - start (list): The starting timestamp for each word (in seconds).
+ - end (list): The ending timestamp for each word (in seconds).
+
+ ```python
+ response = ws.send(
+     model_id=model_id,
+     transcript=transcript,
+     voice_id=voice_id,
+     output_format=output_format,
+     stream=False,
+     add_timestamps=True
+ )
+
+ # Accessing the word_timestamps object
+ word_timestamps = response['word_timestamps']
+
+ words = word_timestamps['words']
+ start_times = word_timestamps['start']
+ end_times = word_timestamps['end']
+
+ for word, start, end in zip(words, start_times, end_times):
+     print(f"Word: {word}, Start: {start}, End: {end}")
+ ```
+
  ### Multilingual Text-to-Speech [Alpha]

  You can use our `sonic-multilingual` model to generate audio in multiple languages. The languages supported are available at [docs.cartesia.ai](https://docs.cartesia.ai/getting-started/available-models).
@@ -454,6 +482,31 @@ stream.close()
  p.terminate()
  ```

+ ### Speed and Emotion Control [Experimental]
+
+ You can enhance the voice output by adjusting the `speed` and `emotion` parameters. To do this, pass an `_experimental_voice_controls` dictionary with the desired `speed` and `emotion` values to any `send` method.
+
+ Speed Options:
+ - `slowest`, `slow`, `normal`, `fast`, `fastest`
+
+ Emotion Options:
+ Use a list of tags in the format `emotion_name:level` where:
+ - Emotion Names: `anger`, `positivity`, `surprise`, `sadness`, `curiosity`
+ - Levels: `lowest`, `low`, `high`, `highest` (omit the level for medium intensity)
+ Each tag adds the specified emotion to the voice at the indicated intensity.
+
+ ```python
+ ws.send(
+     model_id=model_id,
+     transcript=transcript,
+     voice_id=voice_id,
+     output_format=output_format,
+     _experimental_voice_controls={"speed": "fast", "emotion": ["positivity:high"]},
+ )
+ ```
+
+ ### Jupyter Notebook Usage
+
  If you are using Jupyter Notebook or JupyterLab, you can use IPython.display.Audio to play the generated audio directly in the notebook.
  Additionally, in these notebook examples we show how to use the client as a context manager (though this is not required).

@@ -10,7 +10,7 @@ import os
  import sys
  from cartesia import AsyncCartesia, Cartesia
  from cartesia.client import DEFAULT_MODEL_ID, MULTILINGUAL_MODEL_ID
- from cartesia._types import VoiceMetadata
+ from cartesia._types import VoiceControls, VoiceMetadata
  from typing import AsyncGenerator, Generator, List
  import numpy as np
  import pytest
@@ -19,9 +19,11 @@ import asyncio

  THISDIR = os.path.dirname(__file__)
  sys.path.insert(0, os.path.dirname(THISDIR))
+ RESOURCES_DIR = os.path.join(THISDIR, "resources")

  SAMPLE_VOICE = "Newsman"
  SAMPLE_VOICE_ID = "d46abd1d-2d02-43e8-819f-51fb652c1c61"
+ EXPERIMENTAL_VOICE_CONTROLS = {"emotion": ["anger:high", "positivity:low"], "speed": "fastest"}

  logger = logging.getLogger(__name__)

@@ -84,7 +86,12 @@ def test_get_voice_from_id(client: Cartesia):
  # cloned_voice_embedding = client.voices.clone(link=url)
  # assert isinstance(cloned_voice_embedding, list)
  # assert len(cloned_voice_embedding) == 192
-
+
+ def test_clone_voice_with_file(client: Cartesia):
+     logger.info("Testing voices.clone with file")
+     output = client.voices.clone(filepath=os.path.join(RESOURCES_DIR, "sample-speech-4s.wav"))
+     assert isinstance(output, list)
+
  def test_create_voice(client: Cartesia):
      logger.info("Testing voices.create")
      embedding = np.ones(192).tolist()
@@ -96,7 +103,8 @@ def test_create_voice(client: Cartesia):
      assert voice in voices

  @pytest.mark.parametrize("stream", [True, False])
- def test_sse_send(resources: _Resources, stream: bool):
+ @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+ def test_sse_send(resources: _Resources, stream: bool, _experimental_voice_controls: VoiceControls):
      logger.info("Testing SSE send")
      client = resources.client
      transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -105,7 +113,7 @@ def test_sse_send(resources: _Resources, stream: bool):
          "container": "raw",
          "encoding": "pcm_f32le",
          "sample_rate": 44100
-     }, stream=stream, model_id=DEFAULT_MODEL_ID)
+     }, stream=stream, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)

      if not stream:
          output_generate = [output_generate]
@@ -132,7 +140,8 @@ def test_sse_send_with_model_id(resources: _Resources, stream: bool):
      assert isinstance(out["audio"], bytes)

  @pytest.mark.parametrize("stream", [True, False])
- def test_websocket_send(resources: _Resources, stream: bool):
+ @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+ def test_websocket_send(resources: _Resources, stream: bool, _experimental_voice_controls: VoiceControls):
      logger.info("Testing WebSocket send")
      client = resources.client
      transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -143,7 +152,7 @@ def test_websocket_send(resources: _Resources, stream: bool):
          "container": "raw",
          "encoding": "pcm_f32le",
          "sample_rate": 44100
-     }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id)
+     }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id, _experimental_voice_controls=_experimental_voice_controls)

      if not stream:
          output_generate = [output_generate]
@@ -152,8 +161,37 @@ def test_websocket_send(resources: _Resources, stream: bool):
      assert isinstance(out["audio"], bytes)

      ws.close()
+
+
+ @pytest.mark.parametrize("stream", [True, False])
+ def test_websocket_send_timestamps(resources: _Resources, stream: bool):
+     logger.info("Testing WebSocket send")
+     client = resources.client
+     transcript = "Hello, world! I'm generating audio on Cartesia."
+
+     ws = client.tts.websocket()
+     context_id = str(uuid.uuid4())
+     output_generate = ws.send(transcript=transcript, voice_id=SAMPLE_VOICE_ID, output_format={
+         "container": "raw",
+         "encoding": "pcm_f32le",
+         "sample_rate": 44100
+     }, stream=stream, model_id=DEFAULT_MODEL_ID, context_id=context_id, add_timestamps=True)
+
+     if not stream:
+         output_generate = [output_generate]
+
+     has_wordtimestamps = False
+     for out in output_generate:
+         has_wordtimestamps |= "word_timestamps" in out
+         _validate_schema(out)
+
+     assert has_wordtimestamps, "No word timestamps found"
+
+     ws.close()
+

- def test_sse_send_context_manager(resources: _Resources):
+ @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+ def test_sse_send_context_manager(resources: _Resources, _experimental_voice_controls: VoiceControls):
      logger.info("Testing SSE send context manager")
      transcript = "Hello, world! I'm generating audio on Cartesia."

@@ -162,7 +200,7 @@ def test_sse_send_context_manager(resources: _Resources):
          "container": "raw",
          "encoding": "pcm_f32le",
          "sample_rate": 44100
-     }, stream=True, model_id=DEFAULT_MODEL_ID)
+     }, stream=True, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)
      assert isinstance(output_generate, Generator)

      for out in output_generate:
@@ -183,7 +221,7 @@ def test_sse_send_context_manager_with_err():
          raise RuntimeError("Expected error to be thrown")
      except Exception:
          pass
-
+
  def test_websocket_send_context_manager(resources: _Resources):
      logger.info("Testing WebSocket send context manager")
      transcript = "Hello, world! I'm generating audio on Cartesia."
@@ -216,9 +254,10 @@ def test_websocket_send_context_manage_err(resources: _Resources):
          raise RuntimeError("Expected error to be thrown")
      except Exception:
          pass
-
+
  @pytest.mark.asyncio
- async def test_async_sse_send(resources: _Resources):
+ @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+ async def test_async_sse_send( resources: _Resources, _experimental_voice_controls: VoiceControls):
      logger.info("Testing async SSE send")
      transcript = "Hello, world! I'm generating audio on Cartesia."

@@ -228,7 +267,7 @@ async def test_async_sse_send(resources: _Resources):
          "container": "raw",
          "encoding": "pcm_f32le",
          "sample_rate": 44100
-     }, stream=True, model_id=DEFAULT_MODEL_ID)
+     }, stream=True, model_id=DEFAULT_MODEL_ID, _experimental_voice_controls=_experimental_voice_controls)

      async for out in output:
          assert out.keys() == {"audio"}
@@ -238,7 +277,8 @@ async def test_async_sse_send(resources: _Resources):
      await async_client.close()

  @pytest.mark.asyncio
- async def test_async_websocket_send(resources: _Resources):
+ @pytest.mark.parametrize("_experimental_voice_controls", [None, EXPERIMENTAL_VOICE_CONTROLS])
+ async def test_async_websocket_send(resources: _Resources, _experimental_voice_controls: VoiceControls):
      logger.info("Testing async WebSocket send")
      transcript = "Hello, world! I'm generating audio on Cartesia."

@@ -250,7 +290,7 @@ async def test_async_websocket_send(resources: _Resources):
          "container": "raw",
          "encoding": "pcm_f32le",
          "sample_rate": 44100,
-     }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id)
+     }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id, _experimental_voice_controls=_experimental_voice_controls)

      async for out in output:
          assert out.keys() == {"audio", "context_id"}
@@ -259,7 +299,37 @@ async def test_async_websocket_send(resources: _Resources):
      # Close the websocket
      await ws.close()
      await async_client.close()
+
+
+ @pytest.mark.asyncio
+ async def test_async_websocket_send_timestamps(resources: _Resources):
+     logger.info("Testing async WebSocket send with timestamps")
+     transcript = "Hello, world! I'm generating audio on Cartesia."
+
+     async_client = create_async_client()
+     ws = await async_client.tts.websocket()
+     context_id = str(uuid.uuid4())
+     try:
+         output = await ws.send(transcript=transcript, voice_id=SAMPLE_VOICE_ID, output_format={
+             "container": "raw",
+             "encoding": "pcm_f32le",
+             "sample_rate": 44100,
+         }, stream=True, model_id=DEFAULT_MODEL_ID, context_id=context_id, add_timestamps=True)
+
+         has_wordtimestamps = False
+         async for out in output:
+             assert "context_id" in out
+             has_wordtimestamps |= "word_timestamps" in out
+             _validate_schema(out)

+         assert has_wordtimestamps, "No word timestamps found"
+
+     finally:
+         # Close the websocket
+         await ws.close()
+         await async_client.close()
+
+
  @pytest.mark.asyncio
  async def test_async_sse_send_context_manager(resources: _Resources):
      logger.info("Testing async SSE send context manager")
@@ -766,3 +836,16 @@ def test_websocket_send_with_incorrect_url():
          ws.close()
      except Exception as e:
          logger.info("Unexpected error occured: ", e)
+
+
+ def _validate_schema(out):
+     if "audio" in out:
+         assert isinstance(out["audio"], bytes)
+     if "word_timestamps" in out:
+         assert isinstance(out["word_timestamps"], dict)
+         word_timestamps = out["word_timestamps"]
+
+         assert word_timestamps.keys() == {"words", "start", "end"}
+         assert isinstance(word_timestamps["words"], list) and all(isinstance(word, str) for word in word_timestamps["words"])
+         assert isinstance(word_timestamps["start"], list) and all(isinstance(start, (int, float)) for start in word_timestamps["start"])
+         assert isinstance(word_timestamps["end"], list) and all(isinstance(end, (int, float)) for end in word_timestamps["end"])
@@ -1 +0,0 @@
- __version__ = "1.0.5"